set.seed(42)
library(rcompanion) # effect size calculation
library(igraph)
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
library(corrplot)
## corrplot 0.95 loaded
library(QuantPsyc) # for the multivariate normality test
## Loading required package: boot
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:igraph':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Loading required package: purrr
##
## Attaching package: 'purrr'
## The following objects are masked from 'package:igraph':
##
## compose, simplify
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
##
## Attaching package: 'QuantPsyc'
## The following object is masked from 'package:base':
##
## norm
library(dunn.test)
library(nFactors) # for the scree plot
## Loading required package: lattice
##
## Attaching package: 'lattice'
## The following object is masked from 'package:boot':
##
## melanoma
##
## Attaching package: 'nFactors'
## The following object is masked from 'package:lattice':
##
## parallel
library(psych) # for PA FA
##
## Attaching package: 'psych'
## The following object is masked from 'package:boot':
##
## logit
## The following object is masked from 'package:rcompanion':
##
## phi
library(caret) # highly correlated features removal
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ readr 2.1.5 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::%--%() masks igraph::%--%()
## ✖ ggplot2::%+%() masks psych::%+%()
## ✖ ggplot2::alpha() masks psych::alpha()
## ✖ tibble::as_data_frame() masks dplyr::as_data_frame(), igraph::as_data_frame()
## ✖ purrr::compose() masks igraph::compose()
## ✖ tidyr::crossing() masks igraph::crossing()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ caret::lift() masks purrr::lift()
## ✖ MASS::select() masks dplyr::select()
## ✖ purrr::simplify() masks igraph::simplify()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(paletteer) # color palettes
library(conflicted) # to resolve QuantPsyc x dplyr conflicts
conflict_prefer("select", "dplyr")
## [conflicted] Will prefer dplyr::select over any other package.
conflict_prefer("filter", "dplyr")
## [conflicted] Will prefer dplyr::filter over any other package.
# Describe how factor scores are distributed across the levels of `variable`.
#
# Prints the level counts of `variable`, draws faceted boxplots of the factor
# scores (shown and saved as "distr<variable>.pdf" in the working directory),
# and then, for every factor level of `data_factors_long$factor`, runs a
# Kruskal-Wallis test, Dunn's post-hoc test (Bonferroni-adjusted) and an
# epsilon-squared effect size with its confidence interval. A summary table
# and the factors whose smallest adjusted Dunn p-value falls below common
# thresholds are printed at the end.
#
# Arguments:
#   data_factors_long  long-format data frame with a factor-typed column
#                      `factor`, a column `factor_score`, and the column
#                      named by `variable`.
#   variable           name (string) of the grouping column.
#
# Called for its side effects (console output, plot, PDF file); the value is
# whatever the final cat() returns, as in the original implementation.
analyze_distributions <- function(data_factors_long, variable) {
  factors <- levels(data_factors_long$factor)
  n_factors <- length(factors)

  # NOTE(review): the long table presumably repeats every observation once per
  # factor, so dividing by the number of factors recovers per-level counts.
  print(table(data_factors_long[[variable]], useNA = "ifany") / n_factors)

  plot <- data_factors_long %>%
    ggplot(aes(x = factor_score, y = !!sym(variable))) +
    geom_boxplot() +
    facet_grid(factor ~ .) +
    labs(x = "factor score") +
    theme_bw()
  # Pass the plot explicitly instead of relying on ggplot2's implicit
  # last_plot() state.
  ggsave(paste0("distr", variable, ".pdf"), plot = plot)
  print(plot)

  # Preallocate one slot per factor instead of growing vectors in the loop.
  chi2 <- numeric(n_factors)
  p_val <- numeric(n_factors)
  epsilon2 <- numeric(n_factors)
  epsilon2_lci <- numeric(n_factors)
  epsilon2_uci <- numeric(n_factors)
  min_p_values <- numeric(n_factors)

  for (i in seq_along(factors)) {
    f <- factors[[i]]
    data <- data_factors_long %>% filter(factor == f)
    cat(
      "\nTest for the significance of differences in",
      variable, "over", f, ":\n\n"
    )
    kw <- kruskal.test(data$factor_score, data[[variable]])
    # dunn.test() prints its own table; only the adjusted p-values are kept.
    dunn <- dunn.test(
      data$factor_score, data[[variable]],
      altp = TRUE, method = "bonferroni"
    )
    # With ci = TRUE the result holds the estimate and its CI bounds, read
    # here by position (estimate, lower, upper).
    e2_test <- epsilonSquared(data$factor_score, data[[variable]], ci = TRUE)
    e2 <- e2_test[[1]]
    e2_lci <- e2_test[[2]]
    e2_uci <- e2_test[[3]]
    cat("epsilon2 = ", e2, "(95% CI:", e2_lci, "-", e2_uci, ")\n")

    min_p_values[i] <- min(dunn$altP.adjusted)
    chi2[i] <- kw$statistic[[1]]
    p_val[i] <- kw$p.value
    epsilon2[i] <- e2
    epsilon2_lci[i] <- e2_lci
    epsilon2_uci[i] <- e2_uci
  }

  cat("\n")
  print(
    data.frame(
      factor = factors,
      chi2 = chi2,
      kruskal_p = p_val,
      epsilon2_lci = epsilon2_lci,
      epsilon2 = epsilon2,
      epsilon2_uci = epsilon2_uci
    ) %>% mutate(
      across(c(epsilon2, epsilon2_lci, epsilon2_uci), ~ round(.x, 3))
    ) %>%
      # Report p-values as threshold labels; anything >= .05 is shown rounded.
      mutate(across(kruskal_p, ~ case_when(
        .x < 0.0001 ~ "< .0001",
        .x < 0.001 ~ "< .001",
        .x < 0.01 ~ "< .01",
        .x < 0.05 ~ "< .05",
        .default = as.character(round(.x, 2))
      ))) %>%
      mutate(across(chi2, ~ round(.x, 2)))
  )
  # Factors in which at least one pairwise Dunn comparison passes a threshold.
  cat(
    "\np < 5e-2 found in:",
    factors[min_p_values < 0.05],
    "\np < 1e-2 found in:",
    factors[min_p_values < 0.01],
    "\np < 1e-3 found in:",
    factors[min_p_values < 0.001],
    "\np < 1e-4 found in:",
    factors[min_p_values < 0.0001], "\n"
  )
}
# Attach factor scores to the data and reshape the result.
#
# Arguments:
#   data    data frame whose rows correspond to the rows of `fa_fit$scores`.
#   fa_fit  fitted factor-analysis object providing `$scores` and `$loadings`.
#
# Returns a list with three elements:
#   data       `data` with the score columns appended (names prettified),
#   long       one row per observation and factor (`factor`, `factor_score`),
#   feat_long  `long` with the remaining columns additionally stacked into
#              `feat` / `feat_value`.
# Relies on the global `.firstnonmetacolumn` to locate the metadata columns.
data_factor_bind <- function(data, fa_fit) {
  wide <- bind_cols(data, as.data.frame(fa_fit$scores))
  colnames(wide) <- prettify_feat_name_vector(colnames(wide))

  factor_names <- colnames(fa_fit$loadings)
  meta_positions <- 1:(.firstnonmetacolumn - 1)

  per_factor <- pivot_longer(
    wide,
    any_of(factor_names),
    names_to = "factor", values_to = "factor_score"
  )
  # Keep the factor order of the loadings matrix.
  per_factor <- mutate(
    per_factor,
    factor = factor(factor, levels = factor_names)
  )
  # Metadata first, then the two new columns, then everything else.
  per_factor <- select(
    per_factor,
    all_of(meta_positions), factor, factor_score, everything()
  )

  # After the reordering above, the non-metadata columns start two positions
  # later (behind `factor` and `factor_score`).
  feature_positions <- (.firstnonmetacolumn + 2):ncol(per_factor)
  per_feature <- pivot_longer(
    per_factor,
    all_of(feature_positions),
    names_to = "feat", values_to = "feat_value"
  )

  list(
    data = wide,
    long = per_factor,
    feat_long = per_feature
  )
}
# Lookup table mapping original feature names (`name_orig`) to display names
# (`name_pretty`); it is read by prettify_feat_name().
pretty_names <- read_csv("../feat_name_mapping.csv")
## Rows: 85 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): name_orig, name_pretty
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Translate one original feature name into its display name.
#
# Looks `x` up in the global `pretty_names` table. The display name is
# returned only when exactly one row matches; otherwise `x` is handed back
# unchanged.
prettify_feat_name <- function(x) {
  matches <- pretty_names %>%
    filter(name_orig == x) %>%
    pull(name_pretty)
  # No match or an ambiguous match: keep the name as it is.
  if (length(matches) != 1) {
    return(x)
  }
  matches
}
# Apply prettify_feat_name() to every element of `x` and flatten the
# per-element results into a single vector.
prettify_feat_name_vector <- function(x) {
  prettified <- map(x, prettify_feat_name)
  unlist(prettified)
}
# Load the measurements table (document metadata plus the measured features).
data <- read_csv("../measurements/measurements.csv")
## Rows: 753 Columns: 108
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (20): fpath, KUK_ID, FileName, FileFormat, FolderPath, subcorpus, Source...
## dbl (85): RuleAbstractNouns, RuleAmbiguousRegards, RuleAnaphoricReferences, ...
## lgl (3): ClarityPursuit, SyllogismBased, Bindingness
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Position of the first non-metadata column; the code that uses it treats all
# columns before this position as metadata.
# NOTE(review): the value must match the column layout after the select()
# below - confirm whenever metadata columns are added or dropped.
.firstnonmetacolumn <- 17
# Build `data_no_nas`: drop unused metadata columns, recode and fill missing
# values, and merge the individual RuleGP* columns into a single `GPs` column.
data_no_nas <- data %>%
# drop metadata columns that are not used further (the commented-out names
# are deliberately kept)
select(!c(
fpath,
# KUK_ID,
# FileName,
FolderPath,
# subcorpus,
DocumentTitle,
ClarityPursuit,
Readability,
SyllogismBased,
SourceDB
)) %>%
# replace -1s in variation coefficients with NAs
# (presumably -1 marks an undefined coefficient - TODO confirm)
mutate(across(c(
`RuleDoubleAdpos.max_allowable_distance.v`,
`RuleTooManyNegations.max_negation_frac.v`,
`RuleTooManyNegations.max_allowable_negations.v`,
`RuleTooManyNominalConstructions.max_noun_frac.v`,
`RuleTooManyNominalConstructions.max_allowable_nouns.v`,
`RuleCaseRepetition.max_repetition_count.v`,
`RuleCaseRepetition.max_repetition_frac.v`,
`RulePredSubjDistance.max_distance.v`,
`RulePredObjDistance.max_distance.v`,
`RuleInfVerbDistance.max_distance.v`,
`RuleMultiPartVerbs.max_distance.v`,
`RuleLongSentences.max_length.v`,
`RulePredAtClauseBeginning.max_order.v`,
`mattr.v`,
`maentropy.v`
), ~ na_if(.x, -1))) %>%
# replace NAs with 0s in the listed columns; this covers every column recoded
# above except `mattr.v` and `maentropy.v`, where a recoded -1 would stay NA
replace_na(list(
RuleGPcoordovs = 0,
RuleGPdeverbaddr = 0,
RuleGPpatinstr = 0,
RuleGPdeverbsubj = 0,
RuleGPadjective = 0,
RuleGPpatbenperson = 0,
RuleGPwordorder = 0,
RuleDoubleAdpos = 0,
RuleDoubleAdpos.max_allowable_distance.v = 0,
RuleAmbiguousRegards = 0,
RuleReflexivePassWithAnimSubj = 0,
RuleTooManyNegations = 0,
RuleTooManyNegations.max_negation_frac.v = 0,
RuleTooManyNegations.max_allowable_negations.v = 0,
RuleTooManyNominalConstructions.max_noun_frac.v = 0,
RuleTooManyNominalConstructions.max_allowable_nouns.v = 0,
RuleFunctionWordRepetition = 0,
RuleCaseRepetition.max_repetition_count.v = 0,
RuleCaseRepetition.max_repetition_frac.v = 0,
RuleWeakMeaningWords = 0,
RuleAbstractNouns = 0,
RuleRelativisticExpressions = 0,
RuleConfirmationExpressions = 0,
RuleRedundantExpressions = 0,
RuleTooLongExpressions = 0,
RuleAnaphoricReferences = 0,
RuleLiteraryStyle = 0,
RulePassive = 0,
RulePredSubjDistance = 0,
RulePredSubjDistance.max_distance.v = 0,
RulePredObjDistance = 0,
RulePredObjDistance.max_distance.v = 0,
RuleInfVerbDistance = 0,
RuleInfVerbDistance.max_distance.v = 0,
RuleMultiPartVerbs = 0,
RuleMultiPartVerbs.max_distance.v = 0,
RuleLongSentences.max_length.v = 0,
RulePredAtClauseBeginning.max_order.v = 0,
RuleVerbalNouns = 0,
RuleDoubleComparison = 0,
RuleWrongValencyCase = 0,
RuleWrongVerbonominalCase = 0,
RuleIncompleteConjunction = 0
)) %>%
# replace NAs with medians: each listed column gets its own median, computed
# over the non-missing values
mutate(across(c(
RuleDoubleAdpos.max_allowable_distance,
RuleTooManyNegations.max_negation_frac,
RuleTooManyNegations.max_allowable_negations,
RulePredSubjDistance.max_distance,
RulePredObjDistance.max_distance,
RuleInfVerbDistance.max_distance,
RuleMultiPartVerbs.max_distance
), ~ coalesce(., median(., na.rm = TRUE)))) %>%
# merge GPs: sum the seven RuleGP* columns into one `GPs` column ...
mutate(
GPs = RuleGPcoordovs +
RuleGPdeverbaddr +
RuleGPpatinstr +
RuleGPdeverbsubj +
RuleGPadjective +
RuleGPpatbenperson +
RuleGPwordorder
) %>%
# ... and drop the individual RuleGP* columns
select(!c(
RuleGPcoordovs,
RuleGPdeverbaddr,
RuleGPpatinstr,
RuleGPdeverbsubj,
RuleGPadjective,
RuleGPpatbenperson,
RuleGPwordorder
))
# Build `data_clean` from `data_no_nas`: normalise count variables by text
# length, drop variables excluded from the analysis, and convert the
# categorical metadata columns to factors.
data_clean <- data_no_nas %>%
# norm data expected to correlate with text length:
# the columns below are divided by `word_count`
mutate(across(c(
GPs,
RuleDoubleAdpos,
RuleAmbiguousRegards,
RuleFunctionWordRepetition,
RuleWeakMeaningWords,
RuleAbstractNouns,
RuleRelativisticExpressions,
RuleConfirmationExpressions,
RuleRedundantExpressions,
RuleTooLongExpressions,
RuleAnaphoricReferences,
RuleLiteraryStyle,
RulePassive,
RuleVerbalNouns,
RuleDoubleComparison,
RuleWrongValencyCase,
RuleWrongVerbonominalCase,
RuleIncompleteConjunction,
num_hapax,
RuleReflexivePassWithAnimSubj,
RuleTooManyNominalConstructions,
RulePredSubjDistance,
RuleMultiPartVerbs,
RulePredAtClauseBeginning
), ~ .x / word_count)) %>%
# the columns below are divided by `sent_count` instead
mutate(across(c(
RuleTooFewVerbs,
RuleTooManyNegations,
RuleCaseRepetition,
RuleLongSentences,
RulePredObjDistance,
RuleInfVerbDistance
), ~ .x / sent_count)) %>%
# remove variables identified as text-length dependent
# (several of them were normalised just above and are dropped here anyway)
select(!c(
RuleTooFewVerbs,
RuleTooManyNegations,
RuleTooManyNominalConstructions,
RuleCaseRepetition,
RuleLongSentences,
RulePredAtClauseBeginning,
syllab_count,
char_count
)) %>%
# remove variables identified as unreliable
select(!c(
RuleAmbiguousRegards,
RuleFunctionWordRepetition,
RuleDoubleComparison,
RuleWrongValencyCase,
RuleWrongVerbonominalCase
)) %>%
# remove further variables belonging to the 'acceptability' category
select(!c(RuleIncompleteConjunction)) %>%
# remove artificially limited variables
select(!c(
RuleCaseRepetition.max_repetition_frac,
RuleCaseRepetition.max_repetition_frac.v
)) %>%
# remove variables with too many NAs
select(!c(
RuleDoubleAdpos.max_allowable_distance,
RuleDoubleAdpos.max_allowable_distance.v
)) %>%
# convert the categorical metadata columns to factors
mutate(across(c(
class,
FileFormat,
subcorpus,
DocumentVersion,
LegalActType,
Objectivity,
AuthorType,
RecipientType,
RecipientIndividuation,
Anonymized
), ~ as.factor(.x)))
# Sanity check: show every row that still has a missing value in any column
# from position `.firstnonmetacolumn` onwards. No NAs should be present now,
# so the printed tibble is expected to have zero rows.
data_clean[!complete.cases(data_clean[.firstnonmetacolumn:ncol(data_clean)]), ]
## # A tibble: 0 × 77
## # ℹ 77 variables: KUK_ID <chr>, FileName <chr>, FileFormat <fct>,
## # subcorpus <fct>, SourceID <chr>, DocumentVersion <fct>,
## # ParentDocumentID <chr>, LegalActType <fct>, Objectivity <fct>,
## # Bindingness <lgl>, AuthorType <fct>, RecipientType <fct>,
## # RecipientIndividuation <fct>, Anonymized <fct>, Recipient Type <chr>,
## # class <fct>, RuleAbstractNouns <dbl>, RuleAnaphoricReferences <dbl>,
## # RuleCaseRepetition.max_repetition_count <dbl>, …
# Switch the column names to their display names (names without a mapping are
# kept unchanged by prettify_feat_name()).
colnames(data_clean) <- prettify_feat_name_vector(colnames(data_clean))
# Load the table of per-variable importance measures.
feature_importances <- read_csv("../importance_measures/featcomp.csv")
## Rows: 61 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Variable, Sign
## dbl (15): Importance, p_value, estimate, wilcox_p, wilcox_r, kw_p, kw_chi2, ...
## lgl (4): selected_pval, wilcox_sel, kw_sel, selected_reg
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Names of the variables flagged by the `kw_sel` column of the importance
# table.
selected_features_names <-
  feature_importances$Variable[which(feature_importances$kw_sel)]
See Levshina (2015: 353–54).
# Compute the correlation matrix of `data` and return it in several shapes.
#
# Returns a list with:
#   cor_matrix             the full correlation matrix,
#   cor_matrix_upper       the same matrix with the lower triangle set to 0,
#   cor_tibble_long        one row per (feat1, feat2) cell with `cor` and
#                          `abs_cor`,
#   cor_tibble_long_upper  the long form of the upper-triangle matrix, without
#                          the diagonal and without cells whose absolute
#                          correlation is 0 (so each pair appears once).
analyze_correlation <- function(data) {
  # Stack a square matrix into one row per cell.
  to_long <- function(mat) {
    long <- as_tibble(mat)
    long <- mutate(long, feat1 = rownames(mat))
    long <- pivot_longer(long, !feat1, names_to = "feat2", values_to = "cor")
    mutate(long, abs_cor = abs(cor))
  }

  full <- cor(data)
  upper <- full
  upper[lower.tri(upper)] <- 0

  list(
    cor_matrix = full,
    cor_matrix_upper = upper,
    cor_tibble_long = to_long(full),
    cor_tibble_long_upper = filter(
      to_long(upper),
      feat1 != feat2 & abs_cor > 0
    )
  )
}
# Restrict the cleaned data to the selected features. The readability metrics
# are dropped first because they are conceptually different from the
# remaining features.
data_purish <- data_clean %>%
  select(-c(ari, cli, fkgl, fre, gf, smog)) %>%
  select(any_of(selected_features_names))
what unites the low-communality variables we threw out:
# Absolute-correlation threshold above which two features count as highly
# correlated.
.hcorrcutoff <- 0.9
# List every feature pair above the threshold, strongest first within feat1.
print(
  arrange(
    filter(
      analyze_correlation(data_purish)$cor_tibble_long,
      feat1 != feat2,
      abs_cor > .hcorrcutoff
    ),
    feat1, desc(abs_cor)
  ),
  n = 100
)
## # A tibble: 4 × 4
## feat1 feat2 cor abs_cor
## <chr> <chr> <dbl> <dbl>
## 1 hpoint wordcount 0.958 0.958
## 2 maentropy mattr 0.964 0.964
## 3 mattr maentropy 0.964 0.964
## 4 wordcount hpoint 0.958 0.958
exclude:
# Column positions that caret suggests removing so that no pairwise
# correlation exceeds the cutoff; verbose mode logs each decision.
high_correlations <- findCorrelation(
  x = cor(data_purish),
  cutoff = .hcorrcutoff,
  verbose = TRUE
)
## Compare row 7 and column 6 with corr 0.958
## Means: 0.179 vs 0.186 so flagging column 6
## Compare row 20 and column 15 with corr 0.964
## Means: 0.166 vs 0.187 so flagging column 15
## All correlations <= 0.9
# Names of the columns flagged for removal.
colnames(data_purish)[high_correlations]
## [1] "hpoint" "mattr"
# Drop the flagged columns (selected by position) ...
data_pureish_striphigh <- select(data_purish, !all_of(high_correlations))
# ... and list any pair that is still above the cutoff.
print(
  arrange(
    filter(
      analyze_correlation(data_pureish_striphigh)$cor_tibble_long,
      feat1 != feat2,
      abs_cor > .hcorrcutoff
    ),
    feat1, desc(abs_cor)
  ),
  n = 100
)
## # A tibble: 0 × 4
## # ℹ 4 variables: feat1 <chr>, feat2 <chr>, cor <dbl>, abs_cor <dbl>
# Lower cutoff: 0.35 rather than 0.3, because otherwise the FA bootstrapping
# would freeze.
.lcorrcutoff <- 0.35
# Features whose strongest absolute correlation with any other feature stays
# below the lower cutoff.
low_correlating_features <- local({
  pairwise <- analyze_correlation(data_pureish_striphigh)$cor_tibble_long
  strongest <- pairwise %>%
    filter(feat1 != feat2) %>%
    group_by(feat1) %>%
    summarize(max_cor = max(abs_cor))
  strongest$feat1[which(strongest$max_cor < .lcorrcutoff)]
})
# Show the low-correlating features in the row order of the importance table.
feature_importances$Variable[
  feature_importances$Variable %in% low_correlating_features
]
## [1] "anaphoricrefs" "extrcaseexprs" "caserepcount.v"
## [4] "redundexprs" "relativisticexprs" "VERBcompdist.m"
## [7] "NOUNfrac.v" "abstractNOUNs"
# Final feature set: drop the low-correlating features, switch to display
# names, and plot the signed and the absolute correlation matrix.
data_pure <- select(
  data_pureish_striphigh,
  !any_of(low_correlating_features)
)
colnames(data_pure) <- prettify_feat_name_vector(colnames(data_pure))
corrplot(cor(data_pure))
corrplot(abs(cor(data_pure)))
# Correlation network of the retained features: every feature pair whose
# absolute correlation exceeds the lower cutoff becomes an undirected edge
# weighted by that absolute correlation.
my_colors <- paletteer::paletteer_d("ggthemes::Classic_10_Medium")
network_edges <- analyze_correlation(data_pure)$cor_tibble_long_upper %>%
  filter(abs_cor > .lcorrcutoff)
network <- graph_from_data_frame(
  network_edges,
  directed = FALSE
)
E(network)$weight <- network_edges$abs_cor
# Community detection on the weighted graph.
network_communities <- cluster_optimal(network)
network_membership <- membership(network_communities)
plot(
  network,
  # layout_with_fr replaces the deprecated alias layout.fruchterman.reingold
  layout = layout_with_fr,
  # One palette colour per community id.
  # NOTE(review): the palette name suggests 10 colours; more communities than
  # palette entries would produce NA colours - confirm.
  vertex.color = map(
    network_communities$membership,
    function(x) my_colors[x]
  ) %>% unlist(use.names = FALSE),
  vertex.size = 6,
  vertex.label.color = "black",
  vertex.label.cex = 0.7
)
# Standardise every column with scale(), keeping plain numeric vectors
# instead of the one-column matrices that scale() returns.
data_scaled <- data_pure %>%
  mutate(across(everything(), function(column) scale(column)[, 1]))
# Multivariate normality test on the standardised data.
mult.norm(as.data.frame(data_scaled))$mult.test
## Beta-hat kappa p-val
## Skewness 1072.732 134627.8036 0
## Kurtosis 2721.148 447.0895 0
# Mardia tests of multivariate skew and kurtosis on the standardised data.
mardia(data_scaled)
## Call: mardia(x = data_scaled)
##
## Mardia tests of multivariate skew and kurtosis
## Use describe(x) the to get univariate tests
## n.obs = 753 num.vars = 33
## b1p = 1072.73 skew = 134627.8 with probability <= 0
## small sample skew = 135195.8 with probability <= 0
## b2p = 2721.15 kurtosis = 447.09 with probability <= 0
The p-values are effectively zero, so we can reject the hypothesis that the data follow a multivariate normal distribution, i.e. the distribution isn’t multivariate normal.
# Parallel analysis for the number of factors (principal-axis factoring,
# factor solution only, 20 iterations); the scree plot it draws goes to
# scree.pdf.
pdf("scree.pdf")
fa.parallel(data_scaled, fm = "pa", fa = "fa", n.iter = 20)
## Parallel analysis suggests that the number of factors = 8 and the number of components = NA
# Close the PDF device so the file is written out.
dev.off()
## png
## 2
# Re-seed before the fit, which draws bootstrap samples (n.iter below).
set.seed(42)
# Factor analysis of the standardised features: 8 factors (the number
# suggested by the parallel analysis), principal-axis factoring ("pa"),
# promax rotation, tenBerge factor scores, and 100 bootstrap iterations for
# the confidence intervals shown in the printed result.
fa_broad <- fa(
data_scaled,
nfactors = 8,
fm = "pa",
rotate = "promax",
oblique.scores = TRUE,
scores = "tenBerge",
n.iter = 100
)
## Loading required namespace: GPArotation
# Print the full fit summary.
fa_broad
## Factor Analysis with confidence intervals using method = fa(r = data_scaled, nfactors = 8, n.iter = 100, rotate = "promax",
## scores = "tenBerge", fm = "pa", oblique.scores = TRUE)
## Factor Analysis using method = pa
## Call: fa(r = data_scaled, nfactors = 8, n.iter = 100, rotate = "promax",
## scores = "tenBerge", fm = "pa", oblique.scores = TRUE)
## Standardized loadings (pattern matrix) based upon correlation matrix
## PA1 PA2 PA3 PA5 PA6 PA4 PA8 PA7 h2 u2
## sentlen.m -0.62 -0.02 -0.03 -0.28 0.00 0.37 0.15 -0.02 0.94 0.063
## sentcount 0.15 0.96 0.03 0.32 -0.07 -0.16 0.00 -0.01 0.93 0.066
## atl 0.70 0.00 -0.02 0.06 -0.05 -0.13 0.10 0.30 0.57 0.431
## activity 0.66 -0.01 0.10 0.47 0.00 0.31 -0.09 -0.09 0.89 0.106
## VERBfrac.m 0.80 -0.06 0.20 0.35 -0.02 0.10 -0.12 -0.05 0.90 0.100
## wordcount -0.15 0.95 0.00 0.01 0.02 0.00 -0.05 0.01 0.89 0.114
## entropy 0.03 0.72 0.07 -0.02 0.10 -0.04 -0.12 0.39 0.86 0.141
## sentlen.v 0.00 -0.01 0.73 0.28 0.01 -0.15 0.05 -0.02 0.46 0.538
## predsubjdist.m -0.08 -0.04 0.25 0.12 -0.04 0.06 0.55 -0.04 0.45 0.555
## compoundVERBs 0.99 -0.15 0.30 -0.31 0.07 -0.18 -0.14 -0.04 0.70 0.298
## passives 0.03 -0.09 -0.03 -0.79 0.15 -0.25 -0.06 -0.09 0.57 0.427
## predobjdist.m 0.08 -0.12 0.60 0.01 -0.05 -0.08 0.29 0.00 0.42 0.583
## literary 0.00 -0.04 0.07 -0.34 0.15 0.14 -0.05 0.06 0.24 0.758
## verbdist -0.74 0.00 0.00 -0.12 -0.06 -0.25 0.26 -0.04 0.81 0.188
## maentropy -0.19 -0.07 -0.15 -0.03 0.12 -0.01 -0.01 0.82 0.76 0.245
## predorder.m -0.45 -0.07 0.06 0.06 -0.04 0.19 0.51 0.07 0.70 0.297
## hapaxes 0.10 -0.83 0.07 0.07 0.01 -0.10 0.01 0.29 0.72 0.282
## VERBcomp 0.56 0.02 -0.01 0.15 -0.15 0.54 -0.01 0.04 0.60 0.404
## NOUNcount.v -0.33 -0.04 0.43 -0.08 -0.05 0.01 -0.22 -0.03 0.41 0.594
## subj 0.69 0.12 -0.14 -0.04 0.11 -0.02 0.13 -0.14 0.58 0.422
## NOUNcount.m -0.84 0.05 0.01 -0.08 -0.17 -0.10 0.14 0.07 0.79 0.209
## predobjdist.v 0.05 0.14 0.51 -0.07 0.07 0.04 0.07 0.02 0.39 0.606
## NEGcount.m 0.04 -0.05 -0.06 0.08 1.00 0.08 0.03 0.09 0.94 0.063
## compoundVERBsdist.m 0.13 -0.02 0.71 -0.14 -0.08 -0.04 -0.03 -0.14 0.43 0.566
## VERBfrac.v -0.55 -0.03 0.15 0.23 -0.04 -0.21 -0.06 0.06 0.35 0.648
## NEGcount.v 0.21 0.09 0.01 -0.03 0.75 0.02 -0.11 0.07 0.59 0.415
## compoundVERBsdist.v -0.07 0.23 0.28 -0.20 0.04 0.00 0.06 -0.03 0.33 0.672
## predsubjdist.v -0.14 0.10 0.38 -0.03 0.10 0.13 0.17 0.03 0.47 0.533
## mamr 0.84 -0.07 -0.06 0.02 0.01 0.02 0.16 -0.17 0.77 0.234
## obj 0.08 -0.03 -0.06 0.00 0.08 0.83 0.10 -0.02 0.68 0.322
## predorder.v -0.05 -0.02 0.52 -0.05 0.07 0.16 0.17 0.08 0.54 0.463
## verbalNOUNs 0.23 0.05 -0.02 -0.12 -0.14 -0.18 0.00 0.04 0.14 0.862
## NEGfrac.m -0.03 -0.02 -0.03 0.60 0.29 -0.21 0.09 -0.09 0.40 0.602
## com
## sentlen.m 2.2
## sentcount 1.3
## atl 1.5
## activity 2.4
## VERBfrac.m 1.6
## wordcount 1.1
## entropy 1.7
## sentlen.v 1.4
## predsubjdist.m 1.6
## compoundVERBs 1.6
## passives 1.4
## predobjdist.m 1.6
## literary 2.0
## verbdist 1.6
## maentropy 1.2
## predorder.m 2.4
## hapaxes 1.3
## VERBcomp 2.3
## NOUNcount.v 2.6
## subj 1.4
## NOUNcount.m 1.2
## predobjdist.v 1.3
## NEGcount.m 1.1
## compoundVERBsdist.m 1.3
## VERBfrac.v 1.9
## NEGcount.v 1.3
## compoundVERBsdist.v 3.1
## predsubjdist.v 2.4
## mamr 1.2
## obj 1.1
## predorder.v 1.6
## verbalNOUNs 3.4
## NEGfrac.m 1.9
##
## PA1 PA2 PA3 PA5 PA6 PA4 PA8 PA7
## SS loadings 6.71 3.10 2.53 2.08 1.74 1.56 1.29 1.19
## Proportion Var 0.20 0.09 0.08 0.06 0.05 0.05 0.04 0.04
## Cumulative Var 0.20 0.30 0.37 0.44 0.49 0.54 0.58 0.61
## Proportion Explained 0.33 0.15 0.13 0.10 0.09 0.08 0.06 0.06
## Cumulative Proportion 0.33 0.49 0.61 0.71 0.80 0.88 0.94 1.00
##
## With factor correlations of
## PA1 PA2 PA3 PA5 PA6 PA4 PA8 PA7
## PA1 1.00 0.11 -0.56 0.38 -0.37 -0.18 -0.36 -0.17
## PA2 0.11 1.00 0.17 -0.26 0.27 0.25 0.01 0.18
## PA3 -0.56 0.17 1.00 -0.33 0.30 0.32 0.24 0.11
## PA5 0.38 -0.26 -0.33 1.00 -0.34 -0.23 -0.38 -0.17
## PA6 -0.37 0.27 0.30 -0.34 1.00 0.32 0.11 0.07
## PA4 -0.18 0.25 0.32 -0.23 0.32 1.00 0.00 0.08
## PA8 -0.36 0.01 0.24 -0.38 0.11 0.00 1.00 -0.10
## PA7 -0.17 0.18 0.11 -0.17 0.07 0.08 -0.10 1.00
##
## Mean item complexity = 1.7
## Test of the hypothesis that 8 factors are sufficient.
##
## df null model = 528 with the objective function = 24.21 with Chi Square = 17922.49
## df of the model are 292 and the objective function was 2.94
##
## The root mean square of the residuals (RMSR) is 0.03
## The df corrected root mean square of the residuals is 0.03
##
## The harmonic n.obs is 753 with the empirical chi square 514.88 with prob < 1.6e-14
## The total n.obs was 753 with Likelihood Chi Square = 2157.52 with prob < 2.7e-281
##
## Tucker Lewis Index of factoring reliability = 0.805
## RMSEA index = 0.092 and the 90 % confidence intervals are 0.089 0.096
## BIC = 223.3
## Fit based upon off diagonal values = 0.99
## Measures of factor score adequacy
## PA1 PA2 PA3 PA5 PA6 PA4
## Correlation of (regression) scores with factors 0.98 0.98 0.92 0.94 0.98 0.94
## Multiple R square of scores with factors 0.96 0.96 0.85 0.89 0.96 0.89
## Minimum correlation of possible factor scores 0.92 0.92 0.70 0.77 0.91 0.78
## PA8 PA7
## Correlation of (regression) scores with factors 0.87 0.91
## Multiple R square of scores with factors 0.75 0.82
## Minimum correlation of possible factor scores 0.50 0.65
##
## Coefficients and bootstrapped confidence intervals
## low PA1 upper low PA2 upper low PA3 upper low
## sentlen.m -0.80 -0.62 -0.44 -0.06 -0.02 0.01 -0.08 -0.03 0.04 -0.33
## sentcount 0.10 0.15 0.21 0.90 0.96 1.02 -0.01 0.03 0.07 0.24
## atl 0.47 0.70 0.84 -0.06 0.00 0.08 -0.11 -0.02 0.08 -0.06
## activity 0.48 0.66 0.88 -0.05 -0.01 0.02 0.04 0.10 0.15 0.39
## VERBfrac.m 0.56 0.80 1.06 -0.09 -0.06 -0.01 0.11 0.20 0.27 0.26
## wordcount -0.19 -0.15 -0.08 0.90 0.95 0.99 -0.04 0.00 0.04 -0.03
## entropy -0.05 0.03 0.08 0.67 0.72 0.77 0.02 0.07 0.12 -0.06
## sentlen.v -0.10 0.00 0.08 -0.07 -0.01 0.06 0.55 0.73 0.96 0.20
## predsubjdist.m -0.35 -0.08 0.07 -0.09 -0.04 0.02 0.12 0.25 0.43 -0.03
## compoundVERBs 0.69 0.99 1.31 -0.21 -0.15 -0.09 0.19 0.30 0.41 -0.38
## passives -0.03 0.03 0.09 -0.14 -0.09 -0.04 -0.11 -0.03 0.03 -0.86
## predobjdist.m -0.06 0.08 0.18 -0.18 -0.12 -0.05 0.42 0.60 0.85 -0.14
## literary -0.12 0.00 0.13 -0.11 -0.04 0.03 -0.04 0.07 0.16 -0.43
## verbdist -1.00 -0.74 -0.53 -0.04 0.00 0.03 -0.04 0.00 0.06 -0.27
## maentropy -0.34 -0.19 -0.13 -0.10 -0.07 -0.01 -0.23 -0.15 -0.10 -0.12
## predorder.m -0.76 -0.45 -0.28 -0.11 -0.07 -0.01 -0.03 0.06 0.19 -0.11
## hapaxes 0.01 0.10 0.16 -0.89 -0.83 -0.77 0.00 0.07 0.13 0.00
## VERBcomp 0.39 0.56 0.73 -0.03 0.02 0.08 -0.08 -0.01 0.05 0.08
## NOUNcount.v -0.40 -0.33 -0.16 -0.12 -0.04 0.03 0.29 0.43 0.59 -0.14
## subj 0.49 0.69 0.85 0.06 0.12 0.18 -0.20 -0.14 -0.08 -0.15
## NOUNcount.m -1.12 -0.84 -0.59 -0.01 0.05 0.10 -0.05 0.01 0.10 -0.15
## predobjdist.v -0.10 0.05 0.18 0.05 0.14 0.25 0.31 0.51 0.73 -0.18
## NEGcount.m -0.05 0.04 0.08 -0.08 -0.05 -0.01 -0.12 -0.06 -0.01 -0.01
## compoundVERBsdist.m 0.00 0.13 0.28 -0.09 -0.02 0.06 0.52 0.71 0.95 -0.21
## VERBfrac.v -0.73 -0.55 -0.37 -0.10 -0.03 0.05 0.03 0.15 0.26 0.13
## NEGcount.v 0.14 0.21 0.32 0.03 0.09 0.14 -0.05 0.01 0.06 -0.10
## compoundVERBsdist.v -0.19 -0.07 0.02 0.16 0.23 0.32 0.16 0.28 0.43 -0.29
## predsubjdist.v -0.32 -0.14 -0.03 0.03 0.10 0.18 0.23 0.38 0.58 -0.13
## mamr 0.60 0.84 1.03 -0.12 -0.07 -0.01 -0.12 -0.06 0.02 -0.08
## obj 0.01 0.08 0.15 -0.07 -0.03 0.02 -0.11 -0.06 0.00 -0.05
## predorder.v -0.24 -0.05 0.09 -0.10 -0.02 0.06 0.32 0.52 0.77 -0.16
## verbalNOUNs 0.11 0.23 0.32 -0.02 0.05 0.14 -0.14 -0.02 0.08 -0.22
## NEGfrac.m -0.16 -0.03 0.06 -0.09 -0.02 0.05 -0.11 -0.03 0.05 0.48
## PA5 upper low PA6 upper low PA4 upper low PA8
## sentlen.m -0.28 -0.22 -0.03 0.00 0.06 0.26 0.37 0.49 -0.22 0.15
## sentcount 0.32 0.35 -0.12 -0.07 -0.04 -0.22 -0.16 -0.12 -0.32 0.00
## atl 0.06 0.11 -0.16 -0.05 0.03 -0.27 -0.13 -0.02 -0.61 0.10
## activity 0.47 0.54 -0.05 0.00 0.04 0.21 0.31 0.43 -0.31 -0.09
## VERBfrac.m 0.35 0.43 -0.07 -0.02 0.03 0.04 0.10 0.17 -0.54 -0.12
## wordcount 0.01 0.06 -0.01 0.02 0.07 -0.04 0.00 0.04 -0.19 -0.05
## entropy -0.02 0.02 0.07 0.10 0.15 -0.10 -0.04 0.00 -0.72 -0.12
## sentlen.v 0.28 0.34 -0.06 0.01 0.07 -0.21 -0.15 -0.09 -0.11 0.05
## predsubjdist.m 0.12 0.22 -0.14 -0.04 0.05 -0.07 0.06 0.21 0.06 0.55
## compoundVERBs -0.31 -0.21 0.02 0.07 0.13 -0.27 -0.18 -0.10 -0.63 -0.14
## passives -0.79 -0.66 0.09 0.15 0.21 -0.36 -0.25 -0.16 -0.19 -0.06
## predobjdist.m 0.01 0.12 -0.17 -0.05 0.04 -0.17 -0.08 0.02 -0.18 0.29
## literary -0.34 -0.23 0.07 0.15 0.25 0.05 0.14 0.25 -0.20 -0.05
## verbdist -0.12 -0.01 -0.11 -0.06 -0.02 -0.34 -0.25 -0.18 -0.01 0.26
## maentropy -0.03 0.02 0.07 0.12 0.18 -0.08 -0.01 0.04 -0.87 -0.01
## predorder.m 0.06 0.12 -0.16 -0.04 0.06 0.04 0.19 0.32 0.10 0.51
## hapaxes 0.07 0.12 -0.05 0.01 0.05 -0.15 -0.10 -0.05 -0.31 0.01
## VERBcomp 0.15 0.21 -0.22 -0.15 -0.07 0.38 0.54 0.76 -0.22 -0.01
## NOUNcount.v -0.08 0.06 -0.13 -0.05 0.06 -0.07 0.01 0.11 -0.54 -0.22
## subj -0.04 0.02 0.02 0.11 0.17 -0.09 -0.02 0.05 -0.20 0.13
## NOUNcount.m -0.08 -0.01 -0.26 -0.17 -0.10 -0.18 -0.10 -0.02 -0.25 0.14
## predobjdist.v -0.07 0.03 -0.01 0.07 0.16 -0.05 0.04 0.15 -0.26 0.07
## NEGcount.m 0.08 0.11 0.85 1.00 1.10 0.04 0.08 0.17 -0.18 0.03
## compoundVERBsdist.m -0.14 -0.04 -0.15 -0.08 -0.01 -0.11 -0.04 0.02 -0.35 -0.03
## VERBfrac.v 0.23 0.34 -0.12 -0.04 0.07 -0.34 -0.21 -0.11 -0.37 -0.06
## NEGcount.v -0.03 0.06 0.67 0.75 0.92 -0.05 0.02 0.09 -0.42 -0.11
## compoundVERBsdist.v -0.20 -0.10 -0.04 0.04 0.12 -0.08 0.00 0.10 -0.13 0.06
## predsubjdist.v -0.03 0.07 -0.01 0.10 0.19 0.04 0.13 0.22 -0.13 0.17
## mamr 0.02 0.06 -0.08 0.01 0.07 -0.04 0.02 0.09 -0.12 0.16
## obj 0.00 0.05 0.03 0.08 0.16 0.59 0.83 1.14 -0.17 0.10
## predorder.v -0.05 0.03 -0.02 0.07 0.14 0.06 0.16 0.28 -0.11 0.17
## verbalNOUNs -0.12 -0.03 -0.27 -0.14 -0.04 -0.32 -0.18 -0.07 -0.22 0.00
## NEGfrac.m 0.60 0.67 0.18 0.29 0.38 -0.33 -0.21 -0.12 -0.20 0.09
## upper low PA7 upper
## sentlen.m 0.78 -0.06 -0.02 0.04
## sentcount 0.18 -0.07 -0.01 0.02
## atl 0.49 0.15 0.30 0.43
## activity 0.04 -0.18 -0.09 -0.03
## VERBfrac.m 0.14 -0.12 -0.05 0.00
## wordcount 0.06 -0.04 0.01 0.08
## entropy 0.24 0.16 0.39 0.76
## sentlen.v 0.34 -0.12 -0.02 0.07
## predsubjdist.m 1.45 -0.43 -0.04 0.24
## compoundVERBs 0.15 -0.12 -0.04 0.05
## passives 0.10 -0.16 -0.09 -0.02
## predobjdist.m 1.01 -0.16 0.00 0.10
## literary 0.14 -0.04 0.06 0.19
## verbdist 0.74 -0.13 -0.04 0.00
## maentropy 0.49 0.35 0.82 1.47
## predorder.m 1.09 -0.10 0.07 0.11
## hapaxes 0.18 0.10 0.29 0.53
## VERBcomp 0.13 -0.04 0.04 0.11
## NOUNcount.v 0.24 -0.14 -0.03 0.18
## subj 0.33 -0.40 -0.14 -0.02
## NOUNcount.m 0.73 -0.02 0.07 0.17
## predobjdist.v 0.51 -0.10 0.02 0.13
## NEGcount.m 0.15 0.01 0.09 0.18
## compoundVERBsdist.m 0.44 -0.26 -0.14 -0.04
## VERBfrac.v 0.34 -0.06 0.06 0.23
## NEGcount.v 0.10 -0.01 0.07 0.20
## compoundVERBsdist.v 0.33 -0.17 -0.03 0.09
## predsubjdist.v 0.60 -0.12 0.03 0.13
## mamr 0.30 -0.46 -0.17 -0.03
## obj 0.51 -0.10 -0.02 0.05
## predorder.v 0.55 -0.06 0.08 0.21
## verbalNOUNs 0.17 -0.13 0.04 0.18
## NEGfrac.m 0.32 -0.25 -0.09 0.00
##
## Interfactor correlations and bootstrapped confidence intervals
## lower estimate upper
## PA1-PA2 -0.1279 0.1108 0.33
## PA1-PA3 -0.8579 -0.5622 -0.16
## PA1-PA5 -0.7884 0.3830 0.26
## PA1-PA6 -0.7771 -0.3665 0.23
## PA1-PA4 -0.6213 -0.1818 0.13
## PA1-PA8 -0.5916 -0.3611 0.20
## PA1-PA7 -0.4538 -0.1660 0.19
## PA2-PA3 -0.0072 0.1702 0.33
## PA2-PA5 -0.2310 -0.2586 0.57
## PA2-PA6 -0.2346 0.2683 0.51
## PA2-PA4 -0.0981 0.2463 0.44
## PA2-PA8 -0.1620 0.0064 0.41
## PA2-PA7 -0.1381 0.1785 0.32
## PA3-PA5 -0.2780 -0.3255 0.71
## PA3-PA6 -0.2549 0.3000 0.71
## PA3-PA4 -0.0563 0.3241 0.60
## PA3-PA8 -0.1515 0.2427 0.55
## PA3-PA7 -0.2317 0.1085 0.43
## PA5-PA6 -0.4503 -0.3378 0.70
## PA5-PA4 -0.2834 -0.2304 0.59
## PA5-PA8 -0.2293 -0.3838 0.46
## PA5-PA7 -0.2434 -0.1659 0.34
## PA6-PA4 -0.2287 0.3221 0.53
## PA6-PA8 -0.2366 0.1114 0.38
## PA6-PA7 -0.2273 0.0710 0.30
## PA4-PA8 -0.2312 -0.0029 0.41
## PA4-PA7 -0.2247 0.0752 0.28
## PA8-PA7 -0.3293 -0.1047 0.32
# Largest absolute loading of every feature across all factors, weakest
# features first.
fa_broad$loadings[] %>%
  as_tibble() %>%
  mutate(feat = colnames(data_scaled), .before = 1) %>%
  pivot_longer(!feat) %>%
  group_by(feat) %>%
  summarize(maxload = max(abs(value))) %>%
  arrange(maxload)
## # A tibble: 33 × 2
## feat maxload
## <chr> <dbl>
## 1 verbalNOUNs 0.232
## 2 compoundVERBsdist.v 0.281
## 3 literary 0.343
## 4 predsubjdist.v 0.377
## 5 NOUNcount.v 0.431
## 6 predobjdist.v 0.509
## 7 predorder.m 0.515
## 8 predorder.v 0.519
## 9 VERBfrac.v 0.549
## 10 predsubjdist.m 0.551
## # ℹ 23 more rows
# Communalities in ascending order.
sort(fa_broad$communality)
## verbalNOUNs literary compoundVERBsdist.v VERBfrac.v
## 0.1379713 0.2423431 0.3280540 0.3524992
## predobjdist.v NEGfrac.m NOUNcount.v predobjdist.m
## 0.3939045 0.3975483 0.4064061 0.4169727
## compoundVERBsdist.m predsubjdist.m sentlen.v predsubjdist.v
## 0.4336188 0.4453023 0.4615505 0.4669617
## predorder.v atl passives subj
## 0.5370148 0.5694476 0.5733804 0.5775257
## NEGcount.v VERBcomp obj compoundVERBs
## 0.5854885 0.5958715 0.6784960 0.7020210
## predorder.m hapaxes maentropy mamr
## 0.7030408 0.7184036 0.7553256 0.7664031
## NOUNcount.m verbdist entropy wordcount
## 0.7910351 0.8118113 0.8591141 0.8864995
## activity VERBfrac.m sentcount sentlen.m
## 0.8937370 0.8998234 0.9344065 0.9365817
## NEGcount.m
## 0.9365996
# Features with a communality below 0.5.
names(fa_broad$communality[fa_broad$communality < 0.5])
## [1] "sentlen.v" "predsubjdist.m" "predobjdist.m"
## [4] "literary" "NOUNcount.v" "predobjdist.v"
## [7] "compoundVERBsdist.m" "VERBfrac.v" "compoundVERBsdist.v"
## [10] "predsubjdist.v" "verbalNOUNs" "NEGfrac.m"
# Item complexities in ascending order.
sort(fa_broad$complexity)
## wordcount NEGcount.m obj mamr
## 1.058480 1.059835 1.079227 1.183128
## NOUNcount.m maentropy NEGcount.v compoundVERBsdist.m
## 1.203656 1.249629 1.261795 1.268893
## predobjdist.v hapaxes sentcount passives
## 1.333335 1.333578 1.346796 1.350058
## subj sentlen.v atl predorder.v
## 1.372625 1.381042 1.509559 1.551827
## verbdist compoundVERBs VERBfrac.m predobjdist.m
## 1.558892 1.579530 1.616498 1.633887
## predsubjdist.m entropy NEGfrac.m VERBfrac.v
## 1.647062 1.696694 1.871425 1.926064
## literary sentlen.m VERBcomp predsubjdist.v
## 1.976897 2.244205 2.308159 2.404788
## predorder.m activity NOUNcount.v compoundVERBsdist.v
## 2.412118 2.434222 2.574050 3.113858
## verbalNOUNs
## 3.371824
# Names of the variables whose complexity is above 2.
names(fa_broad$complexity[fa_broad$complexity > 2])
## [1] "sentlen.m" "activity" "predorder.m"
## [4] "VERBcomp" "NOUNcount.v" "compoundVERBsdist.v"
## [7] "predsubjdist.v" "verbalNOUNs"
Comrey and Lee (1992) rate absolute loadings as excellent (> .70), very good (> .63), good (> .55), fair (> .45), and poor (> .32).
# Draw the diagram of the factor solution, then print its loadings.
psych::fa.diagram(fa_broad)
print(fa_broad$loadings)
##
## Loadings:
## PA1 PA2 PA3 PA5 PA6 PA4 PA8 PA7
## sentlen.m -0.619 -0.283 0.366 0.147
## sentcount 0.152 0.961 0.317 -0.161
## atl 0.695 -0.127 0.103 0.297
## activity 0.661 0.473 0.306
## VERBfrac.m 0.798 0.196 0.346 0.100 -0.120
## wordcount -0.150 0.946
## entropy 0.717 0.102 -0.120 0.390
## sentlen.v 0.731 0.275 -0.147
## predsubjdist.m 0.254 0.122 0.551
## compoundVERBs 0.992 -0.154 0.296 -0.308 -0.177 -0.142
## passives -0.790 0.146 -0.248
## predobjdist.m -0.116 0.598 0.289
## literary -0.343 0.149 0.136
## verbdist -0.741 -0.118 -0.246 0.258
## maentropy -0.190 -0.154 0.125 0.819
## predorder.m -0.452 0.188 0.515
## hapaxes 0.103 -0.829 0.286
## VERBcomp 0.555 0.145 -0.151 0.538
## NOUNcount.v -0.326 0.431 -0.222
## subj 0.693 0.118 -0.143 0.105 0.131 -0.140
## NOUNcount.m -0.839 -0.168 0.139
## predobjdist.v 0.144 0.509
## NEGcount.m 0.997
## compoundVERBsdist.m 0.128 0.714 -0.139 -0.142
## VERBfrac.v -0.549 0.150 0.229 -0.213
## NEGcount.v 0.213 0.751 -0.111
## compoundVERBsdist.v 0.231 0.281 -0.196
## predsubjdist.v -0.144 0.377 0.129 0.174
## mamr 0.838 0.157 -0.171
## obj 0.828
## predorder.v 0.519 0.160 0.165
## verbalNOUNs 0.232 -0.118 -0.140 -0.176
## NEGfrac.m 0.598 0.295 -0.214
##
## PA1 PA2 PA3 PA5 PA6 PA4 PA8 PA7
## SS loadings 6.541 3.204 2.638 2.011 1.855 1.689 1.032 1.136
## Proportion Var 0.198 0.097 0.080 0.061 0.056 0.051 0.031 0.034
## Cumulative Var 0.198 0.295 0.375 0.436 0.492 0.544 0.575 0.609
# Print the loadings of each factor, strongest first, with a star rating that
# follows the Comrey and Lee (1992) cutoffs; loadings of 0.1 or less in
# absolute value are left out.
# seq_len() yields an empty sequence when no factor was extracted, whereas
# 1:0 would run the body with i = 1 and i = 0.
for (i in seq_len(fa_broad$factors)) {
  cat("\n-----", colnames(fa_broad$loadings)[i], "-----\n")
  loadings <- fa_broad$loadings[, i]
  load_df <- data.frame(loading = loadings)
  load_df_filtered <- load_df %>%
    mutate(
      abs_l = abs(loading),
      # case_when() uses the first branch that matches, so every rating only
      # needs its lower cutoff.
      strng = case_when(
        abs_l > 0.70 ~ "*****",
        abs_l > 0.63 ~ "**** ",
        abs_l > 0.55 ~ "*** ",
        abs_l > 0.45 ~ "** ",
        abs_l > 0.32 ~ "* ",
        .default = ""
      )
    ) %>%
    filter(abs_l > 0.1) %>%
    arrange(desc(abs_l))
  # Round only for display; rating and filtering above use the exact values.
  load_df_filtered %>%
    mutate(across(c(loading, abs_l), ~ round(.x, 3))) %>%
    print()
  cat("\n")
}
##
## ----- PA1 -----
## loading abs_l strng
## compoundVERBs 0.992 0.992 *****
## NOUNcount.m -0.839 0.839 *****
## mamr 0.838 0.838 *****
## VERBfrac.m 0.798 0.798 *****
## verbdist -0.741 0.741 *****
## atl 0.695 0.695 ****
## subj 0.693 0.693 ****
## activity 0.661 0.661 ****
## sentlen.m -0.619 0.619 ***
## VERBcomp 0.555 0.555 ***
## VERBfrac.v -0.549 0.549 **
## predorder.m -0.452 0.452 **
## NOUNcount.v -0.326 0.326 *
## verbalNOUNs 0.232 0.232
## NEGcount.v 0.213 0.213
## maentropy -0.190 0.190
## sentcount 0.152 0.152
## wordcount -0.150 0.150
## predsubjdist.v -0.144 0.144
## compoundVERBsdist.m 0.128 0.128
## hapaxes 0.103 0.103
##
##
## ----- PA2 -----
## loading abs_l strng
## sentcount 0.961 0.961 *****
## wordcount 0.946 0.946 *****
## hapaxes -0.829 0.829 *****
## entropy 0.717 0.717 *****
## compoundVERBsdist.v 0.231 0.231
## compoundVERBs -0.154 0.154
## predobjdist.v 0.144 0.144
## subj 0.118 0.118
## predobjdist.m -0.116 0.116
##
##
## ----- PA3 -----
## loading abs_l strng
## sentlen.v 0.731 0.731 *****
## compoundVERBsdist.m 0.714 0.714 *****
## predobjdist.m 0.598 0.598 ***
## predorder.v 0.519 0.519 **
## predobjdist.v 0.509 0.509 **
## NOUNcount.v 0.431 0.431 *
## predsubjdist.v 0.377 0.377 *
## compoundVERBs 0.296 0.296
## compoundVERBsdist.v 0.281 0.281
## predsubjdist.m 0.254 0.254
## VERBfrac.m 0.196 0.196
## maentropy -0.154 0.154
## VERBfrac.v 0.150 0.150
## subj -0.143 0.143
##
##
## ----- PA5 -----
## loading abs_l strng
## passives -0.790 0.790 *****
## NEGfrac.m 0.598 0.598 ***
## activity 0.473 0.473 **
## VERBfrac.m 0.346 0.346 *
## literary -0.343 0.343 *
## sentcount 0.317 0.317
## compoundVERBs -0.308 0.308
## sentlen.m -0.283 0.283
## sentlen.v 0.275 0.275
## VERBfrac.v 0.229 0.229
## compoundVERBsdist.v -0.196 0.196
## VERBcomp 0.145 0.145
## compoundVERBsdist.m -0.139 0.139
## predsubjdist.m 0.122 0.122
## verbdist -0.118 0.118
## verbalNOUNs -0.118 0.118
##
##
## ----- PA6 -----
## loading abs_l strng
## NEGcount.m 0.997 0.997 *****
## NEGcount.v 0.751 0.751 *****
## NEGfrac.m 0.295 0.295
## NOUNcount.m -0.168 0.168
## VERBcomp -0.151 0.151
## literary 0.149 0.149
## passives 0.146 0.146
## verbalNOUNs -0.140 0.140
## maentropy 0.125 0.125
## subj 0.105 0.105
## entropy 0.102 0.102
##
##
## ----- PA4 -----
## loading abs_l strng
## obj 0.828 0.828 *****
## VERBcomp 0.538 0.538 **
## sentlen.m 0.366 0.366 *
## activity 0.306 0.306
## passives -0.248 0.248
## verbdist -0.246 0.246
## NEGfrac.m -0.214 0.214
## VERBfrac.v -0.213 0.213
## predorder.m 0.188 0.188
## compoundVERBs -0.177 0.177
## verbalNOUNs -0.176 0.176
## sentcount -0.161 0.161
## predorder.v 0.160 0.160
## sentlen.v -0.147 0.147
## literary 0.136 0.136
## predsubjdist.v 0.129 0.129
## atl -0.127 0.127
## VERBfrac.m 0.100 0.100
##
##
## ----- PA8 -----
## loading abs_l strng
## predsubjdist.m 0.551 0.551 ***
## predorder.m 0.515 0.515 **
## predobjdist.m 0.289 0.289
## verbdist 0.258 0.258
## NOUNcount.v -0.222 0.222
## predsubjdist.v 0.174 0.174
## predorder.v 0.165 0.165
## mamr 0.157 0.157
## sentlen.m 0.147 0.147
## compoundVERBs -0.142 0.142
## NOUNcount.m 0.139 0.139
## subj 0.131 0.131
## VERBfrac.m -0.120 0.120
## entropy -0.120 0.120
## NEGcount.v -0.111 0.111
## atl 0.103 0.103
##
##
## ----- PA7 -----
## loading abs_l strng
## maentropy 0.819 0.819 *****
## entropy 0.390 0.390 *
## atl 0.297 0.297
## hapaxes 0.286 0.286
## mamr -0.171 0.171
## compoundVERBsdist.m -0.142 0.142
## subj -0.140 0.140
hypotheses:
# Uniqueness of every variable, rounded to three decimals.
round(fa_broad$uniquenesses, digits = 3)
## sentlen.m sentcount atl activity
## 0.063 0.066 0.431 0.106
## VERBfrac.m wordcount entropy sentlen.v
## 0.100 0.114 0.141 0.538
## predsubjdist.m compoundVERBs passives predobjdist.m
## 0.555 0.298 0.427 0.583
## literary verbdist maentropy predorder.m
## 0.758 0.188 0.245 0.297
## hapaxes VERBcomp NOUNcount.v subj
## 0.282 0.404 0.594 0.422
## NOUNcount.m predobjdist.v NEGcount.m compoundVERBsdist.m
## 0.209 0.606 0.063 0.566
## VERBfrac.v NEGcount.v compoundVERBsdist.v predsubjdist.v
## 0.648 0.415 0.672 0.533
## mamr obj predorder.v verbalNOUNs
## 0.234 0.322 0.463 0.862
## NEGfrac.m
## 0.602
# Combine the cleaned data with the fa_broad solution
# (presumably its factor scores; confirm in data_factor_bind()).
broad_data <- data_factor_bind(data_clean, fa_broad)

# Shapiro-Wilk p-value of the factor scores, one row per factor.
summarise(
  group_by(broad_data$long, factor),
  shapiro = shapiro.test(factor_score)[["p.value"]]
)
## # A tibble: 8 × 2
## factor shapiro
## <fct> <dbl>
## 1 PA1 1.41e- 8
## 2 PA2 3.52e-13
## 3 PA3 4.05e-32
## 4 PA5 1.73e- 2
## 5 PA6 7.21e-12
## 6 PA4 1.50e-12
## 7 PA8 1.34e-34
## 8 PA7 4.28e- 7
# Jittered factor scores by class, one facet row per factor.
# The jitter is vertical only (width = 0), so the scores stay exact.
ggplot(broad_data$long, aes(x = factor_score, y = class)) +
  geom_jitter(width = 0, height = 0.1, alpha = 0.2) +
  facet_grid(rows = vars(factor)) +
  theme(legend.position = "bottom")
# Test, factor by factor, whether the factor scores differ between the levels
# of `class`; the run prints group counts, Kruskal-Wallis tests with
# Bonferroni-adjusted pairwise comparisons, epsilon-squared effect sizes and
# a summary table, and saves an image.
analyze_distributions(broad_data$long, "class")
##
## bad good
## 414 339
## Saving 7 x 5 in image
##
## Test for the significance of differences in class over PA1 :
##
## Kruskal-Wallis rank sum test
##
## data: x and group
## Kruskal-Wallis chi-squared = 134.1647, df = 1, p-value = 0
##
##
## Comparison of x by group
## (Bonferroni)
## Col Mean-|
## Row Mean | bad
## ---------+-----------
## good | -11.58295
## | 0.0000*
##
## alpha = 0.05
## Reject Ho if p <= alpha
## epsilon2 = 0.178 (95% CI: 0.129 - 0.233 )
##
## Test for the significance of differences in class over PA2 :
##
## Kruskal-Wallis rank sum test
##
## data: x and group
## Kruskal-Wallis chi-squared = 1.5495, df = 1, p-value = 0.21
##
##
## Comparison of x by group
## (Bonferroni)
## Col Mean-|
## Row Mean | bad
## ---------+-----------
## good | 1.244788
## | 0.2132
##
## alpha = 0.05
## Reject Ho if p <= alpha
## epsilon2 = 0.00206 (95% CI: 4.87e-06 - 0.0137 )
##
## Test for the significance of differences in class over PA3 :
##
## Kruskal-Wallis rank sum test
##
## data: x and group
## Kruskal-Wallis chi-squared = 8.5251, df = 1, p-value = 0
##
##
## Comparison of x by group
## (Bonferroni)
## Col Mean-|
## Row Mean | bad
## ---------+-----------
## good | 2.919772
## | 0.0035*
##
## alpha = 0.05
## Reject Ho if p <= alpha
## epsilon2 = 0.0113 (95% CI: 0.00128 - 0.0317 )
##
## Test for the significance of differences in class over PA5 :
##
## Kruskal-Wallis rank sum test
##
## data: x and group
## Kruskal-Wallis chi-squared = 111.8462, df = 1, p-value = 0
##
##
## Comparison of x by group
## (Bonferroni)
## Col Mean-|
## Row Mean | bad
## ---------+-----------
## good | -10.57573
## | 0.0000*
##
## alpha = 0.05
## Reject Ho if p <= alpha
## epsilon2 = 0.149 (95% CI: 0.102 - 0.201 )
##
## Test for the significance of differences in class over PA6 :
##
## Kruskal-Wallis rank sum test
##
## data: x and group
## Kruskal-Wallis chi-squared = 35.0328, df = 1, p-value = 0
##
##
## Comparison of x by group
## (Bonferroni)
## Col Mean-|
## Row Mean | bad
## ---------+-----------
## good | 5.918850
## | 0.0000*
##
## alpha = 0.05
## Reject Ho if p <= alpha
## epsilon2 = 0.0466 (95% CI: 0.0214 - 0.0831 )
##
## Test for the significance of differences in class over PA4 :
##
## Kruskal-Wallis rank sum test
##
## data: x and group
## Kruskal-Wallis chi-squared = 1.9676, df = 1, p-value = 0.16
##
##
## Comparison of x by group
## (Bonferroni)
## Col Mean-|
## Row Mean | bad
## ---------+-----------
## good | 1.402723
## | 0.1607
##
## alpha = 0.05
## Reject Ho if p <= alpha
## epsilon2 = 0.00262 (95% CI: 8.95e-06 - 0.0136 )
##
## Test for the significance of differences in class over PA8 :
##
## Kruskal-Wallis rank sum test
##
## data: x and group
## Kruskal-Wallis chi-squared = 1.7297, df = 1, p-value = 0.19
##
##
## Comparison of x by group
## (Bonferroni)
## Col Mean-|
## Row Mean | bad
## ---------+-----------
## good | -1.315169
## | 0.1885
##
## alpha = 0.05
## Reject Ho if p <= alpha
## epsilon2 = 0.0023 (95% CI: 8.04e-06 - 0.0139 )
##
## Test for the significance of differences in class over PA7 :
##
## Kruskal-Wallis rank sum test
##
## data: x and group
## Kruskal-Wallis chi-squared = 25.6664, df = 1, p-value = 0
##
##
## Comparison of x by group
## (Bonferroni)
## Col Mean-|
## Row Mean | bad
## ---------+-----------
## good | 5.066204
## | 0.0000*
##
## alpha = 0.05
## Reject Ho if p <= alpha
## epsilon2 = 0.0341 (95% CI: 0.0118 - 0.0651 )
##
## factor chi2 kruskal_p epsilon2_lci epsilon2 epsilon2_uci
## 1 PA1 134.16 < .0001 0.129 0.178 0.233
## 2 PA2 1.55 0.21 0.000 0.002 0.014
## 3 PA3 8.53 < .01 0.001 0.011 0.032
## 4 PA5 111.85 < .0001 0.102 0.149 0.201
## 5 PA6 35.03 < .0001 0.021 0.047 0.083
## 6 PA4 1.97 0.16 0.000 0.003 0.014
## 7 PA8 1.73 0.19 0.000 0.002 0.014
## 8 PA7 25.67 < .0001 0.012 0.034 0.065
##
## p < 5e-2 found in: PA1 PA3 PA5 PA6 PA7
## p < 1e-2 found in: PA1 PA3 PA5 PA6 PA7
## p < 1e-3 found in: PA1 PA5 PA6 PA7
## p < 1e-4 found in: PA1 PA5 PA6 PA7
# Same per-factor comparison, this time between subcorpora.
# NOTE(review): LiFRLaw has only 3 observations, so its pairwise comparisons
# carry little weight.
analyze_distributions(broad_data$long, "subcorpus")
##
## CzCDC FrBo KUKY LiFRLaw OmbuFlyers
## 211 307 194 3 38
## Saving 7 x 5 in image
##
## Test for the significance of differences in subcorpus over PA1 :
##
## Kruskal-Wallis rank sum test
##
## data: x and group
## Kruskal-Wallis chi-squared = 395.852, df = 4, p-value = 0
##
##
## Comparison of x by group
## (Bonferroni)
## Col Mean-|
## Row Mean | CzCDC FrBo KUKY LiFRLaw
## ---------+--------------------------------------------
## FrBo | -18.96883
## | 0.0000*
## |
## KUKY | -5.099316 12.96436
## | 0.0000* 0.0000*
## |
## LiFRLaw | -1.520822 1.399609 -0.648070
## | 1.0000 1.0000 1.0000
## |
## OmbuFlye | -5.887897 3.830227 -2.989708 -0.255667
## | 0.0000* 0.0013* 0.0279* 1.0000
##
## alpha = 0.05
## Reject Ho if p <= alpha
## epsilon2 = 0.526 (95% CI: 0.481 - 0.572 )
##
## Test for the significance of differences in subcorpus over PA2 :
##
## Kruskal-Wallis rank sum test
##
## data: x and group
## Kruskal-Wallis chi-squared = 5.8651, df = 4, p-value = 0.21
##
##
## Comparison of x by group
## (Bonferroni)
## Col Mean-|
## Row Mean | CzCDC FrBo KUKY LiFRLaw
## ---------+--------------------------------------------
## FrBo | 0.033912
## | 1.0000
## |
## KUKY | 1.604396 1.706931
## | 1.0000 0.8783
## |
## LiFRLaw | 1.270076 1.267642 0.994997
## | 1.0000 1.0000 1.0000
## |
## OmbuFlye | -0.631636 -0.664904 -1.527047 -1.416996
## | 1.0000 1.0000 1.0000 1.0000
##
## alpha = 0.05
## Reject Ho if p <= alpha
## epsilon2 = 0.0078 (95% CI: 0.0026 - 0.0302 )
##
## Test for the significance of differences in subcorpus over PA3 :
##
## Kruskal-Wallis rank sum test
##
## data: x and group
## Kruskal-Wallis chi-squared = 32.2648, df = 4, p-value = 0
##
##
## Comparison of x by group
## (Bonferroni)
## Col Mean-|
## Row Mean | CzCDC FrBo KUKY LiFRLaw
## ---------+--------------------------------------------
## FrBo | 3.399903
## | 0.0067*
## |
## KUKY | -1.106002 -4.514392
## | 1.0000 0.0001*
## |
## LiFRLaw | 2.014068 1.494450 2.201923
## | 0.4400 1.0000 0.2767
## |
## OmbuFlye | -1.595793 -3.403249 -0.965088 -2.421644
## | 1.0000 0.0067* 1.0000 0.1545
##
## alpha = 0.05
## Reject Ho if p <= alpha
## epsilon2 = 0.0429 (95% CI: 0.0205 - 0.078 )
##
## Test for the significance of differences in subcorpus over PA5 :
##
## Kruskal-Wallis rank sum test
##
## data: x and group
## Kruskal-Wallis chi-squared = 158.8361, df = 4, p-value = 0
##
##
## Comparison of x by group
## (Bonferroni)
## Col Mean-|
## Row Mean | CzCDC FrBo KUKY LiFRLaw
## ---------+--------------------------------------------
## FrBo | -11.25575
## | 0.0000*
## |
## KUKY | -9.935614 0.199036
## | 0.0000* 1.0000
## |
## LiFRLaw | 0.906812 2.643720 2.604923
## | 1.0000 0.0820 0.0919
## |
## OmbuFlye | -6.267907 -0.570014 -0.655468 -2.721021
## | 0.0000* 1.0000 1.0000 0.0651
##
## alpha = 0.05
## Reject Ho if p <= alpha
## epsilon2 = 0.211 (95% CI: 0.161 - 0.268 )
##
## Test for the significance of differences in subcorpus over PA6 :
##
## Kruskal-Wallis rank sum test
##
## data: x and group
## Kruskal-Wallis chi-squared = 93.6579, df = 4, p-value = 0
##
##
## Comparison of x by group
## (Bonferroni)
## Col Mean-|
## Row Mean | CzCDC FrBo KUKY LiFRLaw
## ---------+--------------------------------------------
## FrBo | 9.518951
## | 0.0000*
## |
## KUKY | 4.103058 -4.831160
## | 0.0004* 0.0000*
## |
## LiFRLaw | 2.074774 0.612126 1.372011
## | 0.3801 1.0000 1.0000
## |
## OmbuFlye | 3.100564 -1.772579 0.779434 -1.100472
## | 0.0193* 0.7630 1.0000 1.0000
##
## alpha = 0.05
## Reject Ho if p <= alpha
## epsilon2 = 0.125 (95% CI: 0.0853 - 0.175 )
##
## Test for the significance of differences in subcorpus over PA4 :
##
## Kruskal-Wallis rank sum test
##
## data: x and group
## Kruskal-Wallis chi-squared = 52.4123, df = 4, p-value = 0
##
##
## Comparison of x by group
## (Bonferroni)
## Col Mean-|
## Row Mean | CzCDC FrBo KUKY LiFRLaw
## ---------+--------------------------------------------
## FrBo | 6.340314
## | 0.0000*
## |
## KUKY | 3.787715 -2.073977
## | 0.0015* 0.3808
## |
## LiFRLaw | 0.760979 -0.214616 0.112935
## | 1.0000 1.0000 1.0000
## |
## OmbuFlye | 5.237906 2.070563 3.079441 0.801364
## | 0.0000* 0.3840 0.0207* 1.0000
##
## alpha = 0.05
## Reject Ho if p <= alpha
## epsilon2 = 0.0697 (95% CI: 0.042 - 0.113 )
##
## Test for the significance of differences in subcorpus over PA8 :
##
## Kruskal-Wallis rank sum test
##
## data: x and group
## Kruskal-Wallis chi-squared = 26.9652, df = 4, p-value = 0
##
##
## Comparison of x by group
## (Bonferroni)
## Col Mean-|
## Row Mean | CzCDC FrBo KUKY LiFRLaw
## ---------+--------------------------------------------
## FrBo | 1.444019
## | 1.0000
## |
## KUKY | 1.858296 0.607432
## | 0.6313 1.0000
## |
## LiFRLaw | 0.448053 0.226464 0.130070
## | 1.0000 1.0000 1.0000
## |
## OmbuFlye | 5.151007 4.527602 4.074951 1.079222
## | 0.0000* 0.0001* 0.0005* 1.0000
##
## alpha = 0.05
## Reject Ho if p <= alpha
## epsilon2 = 0.0359 (95% CI: 0.0183 - 0.0652 )
##
## Test for the significance of differences in subcorpus over PA7 :
##
## Kruskal-Wallis rank sum test
##
## data: x and group
## Kruskal-Wallis chi-squared = 9.544, df = 4, p-value = 0.05
##
##
## Comparison of x by group
## (Bonferroni)
## Col Mean-|
## Row Mean | CzCDC FrBo KUKY LiFRLaw
## ---------+--------------------------------------------
## FrBo | 1.582285
## | 1.0000
## |
## KUKY | 0.671085 -0.814927
## | 1.0000 1.0000
## |
## LiFRLaw | -0.949506 -1.195481 -1.063657
## | 1.0000 1.0000 1.0000
## |
## OmbuFlye | -1.842386 -2.710779 -2.206470 0.379197
## | 0.6542 0.0671 0.2735 1.0000
##
## alpha = 0.05
## Reject Ho if p <= alpha
## epsilon2 = 0.0127 (95% CI: 0.00364 - 0.0369 )
##
## factor chi2 kruskal_p epsilon2_lci epsilon2 epsilon2_uci
## 1 PA1 395.85 < .0001 0.481 0.526 0.572
## 2 PA2 5.87 0.21 0.003 0.008 0.030
## 3 PA3 32.26 < .0001 0.020 0.043 0.078
## 4 PA5 158.84 < .0001 0.161 0.211 0.268
## 5 PA6 93.66 < .0001 0.085 0.125 0.175
## 6 PA4 52.41 < .0001 0.042 0.070 0.113
## 7 PA8 26.97 < .0001 0.018 0.036 0.065
## 8 PA7 9.54 < .05 0.004 0.013 0.037
##
## p < 5e-2 found in: PA1 PA3 PA5 PA6 PA4 PA8
## p < 1e-2 found in: PA1 PA3 PA5 PA6 PA4 PA8
## p < 1e-3 found in: PA1 PA3 PA5 PA6 PA4 PA8
## p < 1e-4 found in: PA1 PA3 PA5 PA6 PA4 PA8
# Repeat the subcorpus comparison without LiFRLaw (3 observations only).
# The emptied factor level is dropped as well, so LiFRLaw is not carried
# along as a zero-count group.
analyze_distributions(
  broad_data$long %>%
    filter(subcorpus != "LiFRLaw") %>%
    mutate(subcorpus = droplevels(subcorpus)),
  "subcorpus"
)
##
## CzCDC FrBo KUKY LiFRLaw OmbuFlyers
## 211 307 194 0 38
## Saving 7 x 5 in image
##
## Test for the significance of differences in subcorpus over PA1 :
##
## Kruskal-Wallis rank sum test
##
## data: x and group
## Kruskal-Wallis chi-squared = 395.0676, df = 3, p-value = 0
##
##
## Comparison of x by group
## (Bonferroni)
## Col Mean-|
## Row Mean | CzCDC FrBo KUKY
## ---------+---------------------------------
## FrBo | -18.94981
## | 0.0000*
## |
## KUKY | -5.093583 12.95203
## | 0.0000* 0.0000*
## |
## OmbuFlye | -5.882160 3.826214 -2.987223
## | 0.0000* 0.0008* 0.0169*
##
## alpha = 0.05
## Reject Ho if p <= alpha
## epsilon2 = 0.527 (95% CI: 0.485 - 0.574 )
##
## Test for the significance of differences in subcorpus over PA2 :
##
## Kruskal-Wallis rank sum test
##
## data: x and group
## Kruskal-Wallis chi-squared = 4.3463, df = 3, p-value = 0.23
##
##
## Comparison of x by group
## (Bonferroni)
## Col Mean-|
## Row Mean | CzCDC FrBo KUKY
## ---------+---------------------------------
## FrBo | 0.037729
## | 1.0000
## |
## KUKY | 1.596816 1.694989
## | 0.6618 0.5405
## |
## OmbuFlye | -0.629049 -0.664238 -1.520227
## | 1.0000 1.0000 0.7707
##
## alpha = 0.05
## Reject Ho if p <= alpha
## epsilon2 = 0.0058 (95% CI: 0.000972 - 0.0286 )
##
## Test for the significance of differences in subcorpus over PA3 :
##
## Kruskal-Wallis rank sum test
##
## data: x and group
## Kruskal-Wallis chi-squared = 28.8785, df = 3, p-value = 0
##
##
## Comparison of x by group
## (Bonferroni)
## Col Mean-|
## Row Mean | CzCDC FrBo KUKY
## ---------+---------------------------------
## FrBo | 3.410353
## | 0.0039*
## |
## KUKY | -1.110008 -4.528926
## | 1.0000 0.0000*
## |
## OmbuFlye | -1.596167 -3.409067 -0.963214
## | 0.6627 0.0039* 1.0000
##
## alpha = 0.05
## Reject Ho if p <= alpha
## epsilon2 = 0.0386 (95% CI: 0.0179 - 0.072 )
##
## Test for the significance of differences in subcorpus over PA5 :
##
## Kruskal-Wallis rank sum test
##
## data: x and group
## Kruskal-Wallis chi-squared = 154.436, df = 3, p-value = 0
##
##
## Comparison of x by group
## (Bonferroni)
## Col Mean-|
## Row Mean | CzCDC FrBo KUKY
## ---------+---------------------------------
## FrBo | -11.26250
## | 0.0000*
## |
## KUKY | -9.949810 0.190224
## | 0.0000* 1.0000
## |
## OmbuFlye | -6.272823 -0.571540 -0.652392
## | 0.0000* 1.0000 1.0000
##
## alpha = 0.05
## Reject Ho if p <= alpha
## epsilon2 = 0.206 (95% CI: 0.155 - 0.26 )
##
## Test for the significance of differences in subcorpus over PA6 :
##
## Kruskal-Wallis rank sum test
##
## data: x and group
## Kruskal-Wallis chi-squared = 92.2063, df = 3, p-value = 0
##
##
## Comparison of x by group
## (Bonferroni)
## Col Mean-|
## Row Mean | CzCDC FrBo KUKY
## ---------+---------------------------------
## FrBo | 9.524438
## | 0.0000*
## |
## KUKY | 4.101429 -4.838276
## | 0.0002* 0.0000*
## |
## OmbuFlye | 3.102433 -1.773517 0.782204
## | 0.0115* 0.4569 1.0000
##
## alpha = 0.05
## Reject Ho if p <= alpha
## epsilon2 = 0.123 (95% CI: 0.0851 - 0.17 )
##
## Test for the significance of differences in subcorpus over PA4 :
##
## Kruskal-Wallis rank sum test
##
## data: x and group
## Kruskal-Wallis chi-squared = 52.3748, df = 3, p-value = 0
##
##
## Comparison of x by group
## (Bonferroni)
## Col Mean-|
## Row Mean | CzCDC FrBo KUKY
## ---------+---------------------------------
## FrBo | 6.339263
## | 0.0000*
## |
## KUKY | 3.787601 -2.073076
## | 0.0009* 0.2290
## |
## OmbuFlye | 5.236072 2.069230 3.077682
## | 0.0000* 0.2311 0.0125*
##
## alpha = 0.05
## Reject Ho if p <= alpha
## epsilon2 = 0.0699 (95% CI: 0.0398 - 0.114 )
##
## Test for the significance of differences in subcorpus over PA8 :
##
## Kruskal-Wallis rank sum test
##
## data: x and group
## Kruskal-Wallis chi-squared = 26.8487, df = 3, p-value = 0
##
##
## Comparison of x by group
## (Bonferroni)
## Col Mean-|
## Row Mean | CzCDC FrBo KUKY
## ---------+---------------------------------
## FrBo | 1.445179
## | 0.8904
## |
## KUKY | 1.855833 0.603630
## | 0.3809 1.0000
## |
## OmbuFlye | 5.143837 4.519651 4.069209
## | 0.0000* 0.0000* 0.0003*
##
## alpha = 0.05
## Reject Ho if p <= alpha
## epsilon2 = 0.0358 (95% CI: 0.0189 - 0.0675 )
##
## Test for the significance of differences in subcorpus over PA7 :
##
## Kruskal-Wallis rank sum test
##
## data: x and group
## Kruskal-Wallis chi-squared = 8.4499, df = 3, p-value = 0.04
##
##
## Comparison of x by group
## (Bonferroni)
## Col Mean-|
## Row Mean | CzCDC FrBo KUKY
## ---------+---------------------------------
## FrBo | 1.584972
## | 0.6778
## |
## KUKY | 0.674188 -0.814182
## | 1.0000 1.0000
## |
## OmbuFlye | -1.843864 -2.713691 -2.209678
## | 0.3912 0.0399* 0.1628
##
## alpha = 0.05
## Reject Ho if p <= alpha
## epsilon2 = 0.0113 (95% CI: 0.00327 - 0.0317 )
##
## factor chi2 kruskal_p epsilon2_lci epsilon2 epsilon2_uci
## 1 PA1 395.07 < .0001 0.485 0.527 0.574
## 2 PA2 4.35 0.23 0.001 0.006 0.029
## 3 PA3 28.88 < .0001 0.018 0.039 0.072
## 4 PA5 154.44 < .0001 0.155 0.206 0.260
## 5 PA6 92.21 < .0001 0.085 0.123 0.170
## 6 PA4 52.37 < .0001 0.040 0.070 0.114
## 7 PA8 26.85 < .0001 0.019 0.036 0.068
## 8 PA7 8.45 < .05 0.003 0.011 0.032
##
## p < 5e-2 found in: PA1 PA3 PA5 PA6 PA4 PA8 PA7
## p < 1e-2 found in: PA1 PA3 PA5 PA6 PA4 PA8
## p < 1e-3 found in: PA1 PA3 PA5 PA6 PA4 PA8
## p < 1e-4 found in: PA1 PA3 PA5 PA6 PA4 PA8
# Per-factor comparison of the factor scores between recipient types.
# NOTE(review): RecipientType is missing for 13 observations; confirm how
# analyze_distributions() treats NA groups.
analyze_distributions(broad_data$long, "RecipientType")
##
## combined legal person natural person <NA>
## 304 23 413 13
## Saving 7 x 5 in image
##
## Test for the significance of differences in RecipientType over PA1 :
##
## Kruskal-Wallis rank sum test
##
## data: x and group
## Kruskal-Wallis chi-squared = 291.9381, df = 2, p-value = 0
##
##
## Comparison of x by group
## (Bonferroni)
## Col Mean-|
## Row Mean | combined legal pe
## ---------+----------------------
## legal pe | -2.490666
## | 0.0383*
## |
## natural | -17.05905 -3.503143
## | 0.0000* 0.0014*
##
## alpha = 0.05
## Reject Ho if p <= alpha
## epsilon2 = 0.388 (95% CI: 0.329 - 0.446 )
##
## Test for the significance of differences in RecipientType over PA2 :
##
## Kruskal-Wallis rank sum test
##
## data: x and group
## Kruskal-Wallis chi-squared = 18.4473, df = 2, p-value = 0
##
##
## Comparison of x by group
## (Bonferroni)
## Col Mean-|
## Row Mean | combined legal pe
## ---------+----------------------
## legal pe | 3.716618
## | 0.0006*
## |
## natural | 2.856706 -2.743954
## | 0.0128* 0.0182*
##
## alpha = 0.05
## Reject Ho if p <= alpha
## epsilon2 = 0.0245 (95% CI: 0.00876 - 0.0498 )
##
## Test for the significance of differences in RecipientType over PA3 :
##
## Kruskal-Wallis rank sum test
##
## data: x and group
## Kruskal-Wallis chi-squared = 8.8831, df = 2, p-value = 0.01
##
##
## Comparison of x by group
## (Bonferroni)
## Col Mean-|
## Row Mean | combined legal pe
## ---------+----------------------
## legal pe | 1.310350
## | 0.5702
## |
## natural | 2.885911 -0.304734
## | 0.0117* 1.0000
##
## alpha = 0.05
## Reject Ho if p <= alpha
## epsilon2 = 0.0118 (95% CI: 0.00197 - 0.0356 )
##
## Test for the significance of differences in RecipientType over PA5 :
##
## Kruskal-Wallis rank sum test
##
## data: x and group
## Kruskal-Wallis chi-squared = 85.8505, df = 2, p-value = 0
##
##
## Comparison of x by group
## (Bonferroni)
## Col Mean-|
## Row Mean | combined legal pe
## ---------+----------------------
## legal pe | -0.677274
## | 1.0000
## |
## natural | -9.187961 -2.557227
## | 0.0000* 0.0317*
##
## alpha = 0.05
## Reject Ho if p <= alpha
## epsilon2 = 0.114 (95% CI: 0.0761 - 0.168 )
##
## Test for the significance of differences in RecipientType over PA6 :
##
## Kruskal-Wallis rank sum test
##
## data: x and group
## Kruskal-Wallis chi-squared = 106.4328, df = 2, p-value = 0
##
##
## Comparison of x by group
## (Bonferroni)
## Col Mean-|
## Row Mean | combined legal pe
## ---------+----------------------
## legal pe | 1.168976
## | 0.7272
## |
## natural | 10.27569 2.444572
## | 0.0000* 0.0435*
##
## alpha = 0.05
## Reject Ho if p <= alpha
## epsilon2 = 0.142 (95% CI: 0.101 - 0.2 )
##
## Test for the significance of differences in RecipientType over PA4 :
##
## Kruskal-Wallis rank sum test
##
## data: x and group
## Kruskal-Wallis chi-squared = 34.625, df = 2, p-value = 0
##
##
## Comparison of x by group
## (Bonferroni)
## Col Mean-|
## Row Mean | combined legal pe
## ---------+----------------------
## legal pe | 2.122712
## | 0.1013
## |
## natural | 5.803025 -0.095787
## | 0.0000* 1.0000
##
## alpha = 0.05
## Reject Ho if p <= alpha
## epsilon2 = 0.046 (95% CI: 0.0216 - 0.0866 )
##
## Test for the significance of differences in RecipientType over PA8 :
##
## Kruskal-Wallis rank sum test
##
## data: x and group
## Kruskal-Wallis chi-squared = 15.308, df = 2, p-value = 0
##
##
## Comparison of x by group
## (Bonferroni)
## Col Mean-|
## Row Mean | combined legal pe
## ---------+----------------------
## legal pe | 0.435293
## | 1.0000
## |
## natural | 3.896288 0.934950
## | 0.0003* 1.0000
##
## alpha = 0.05
## Reject Ho if p <= alpha
## epsilon2 = 0.0204 (95% CI: 0.00652 - 0.0463 )
##
## Test for the significance of differences in RecipientType over PA7 :
##
## Kruskal-Wallis rank sum test
##
## data: x and group
## Kruskal-Wallis chi-squared = 3.8949, df = 2, p-value = 0.14
##
##
## Comparison of x by group
## (Bonferroni)
## Col Mean-|
## Row Mean | combined legal pe
## ---------+----------------------
## legal pe | 1.441846
## | 0.4480
## |
## natural | 1.610202 -0.887450
## | 0.3221 1.0000
##
## alpha = 0.05
## Reject Ho if p <= alpha
## epsilon2 = 0.00518 (95% CI: 0.000373 - 0.0228 )
##
## factor chi2 kruskal_p epsilon2_lci epsilon2 epsilon2_uci
## 1 PA1 291.94 < .0001 0.329 0.388 0.446
## 2 PA2 18.45 < .0001 0.009 0.024 0.050
## 3 PA3 8.88 < .05 0.002 0.012 0.036
## 4 PA5 85.85 < .0001 0.076 0.114 0.168
## 5 PA6 106.43 < .0001 0.101 0.142 0.200
## 6 PA4 34.62 < .0001 0.022 0.046 0.087
## 7 PA8 15.31 < .001 0.007 0.020 0.046
## 8 PA7 3.89 0.14 0.000 0.005 0.023
##
## p < 5e-2 found in: PA1 PA2 PA3 PA5 PA6 PA4 PA8
## p < 1e-2 found in: PA1 PA2 PA5 PA6 PA4 PA8
## p < 1e-3 found in: PA1 PA2 PA5 PA6 PA4 PA8
## p < 1e-4 found in: PA1 PA5 PA6 PA4
Court decisions often have RecipientType = combined.
# Per-factor comparison of the factor scores between the levels of
# RecipientIndividuation.
# NOTE(review): the value is missing for 9 observations; confirm how
# analyze_distributions() treats NA groups.
analyze_distributions(broad_data$long, "RecipientIndividuation")
##
## bulk individual public <NA>
## 69 356 319 9
## Saving 7 x 5 in image
##
## Test for the significance of differences in RecipientIndividuation over PA1 :
##
## Kruskal-Wallis rank sum test
##
## data: x and group
## Kruskal-Wallis chi-squared = 231.7611, df = 2, p-value = 0
##
##
## Comparison of x by group
## (Bonferroni)
## Col Mean-|
## Row Mean | bulk individu
## ---------+----------------------
## individu | -0.802883
## | 1.0000
## |
## public | -9.148637 -14.38526
## | 0.0000* 0.0000*
##
## alpha = 0.05
## Reject Ho if p <= alpha
## epsilon2 = 0.308 (95% CI: 0.255 - 0.368 )
##
## Test for the significance of differences in RecipientIndividuation over PA2 :
##
## Kruskal-Wallis rank sum test
##
## data: x and group
## Kruskal-Wallis chi-squared = 39.7178, df = 2, p-value = 0
##
##
## Comparison of x by group
## (Bonferroni)
## Col Mean-|
## Row Mean | bulk individu
## ---------+----------------------
## individu | 5.819968
## | 0.0000*
## |
## public | 3.480791 -3.935297
## | 0.0015* 0.0002*
##
## alpha = 0.05
## Reject Ho if p <= alpha
## epsilon2 = 0.0528 (95% CI: 0.0248 - 0.093 )
##
## Test for the significance of differences in RecipientIndividuation over PA3 :
##
## Kruskal-Wallis rank sum test
##
## data: x and group
## Kruskal-Wallis chi-squared = 6.1779, df = 2, p-value = 0.05
##
##
## Comparison of x by group
## (Bonferroni)
## Col Mean-|
## Row Mean | bulk individu
## ---------+----------------------
## individu | 0.583560
## | 1.0000
## |
## public | 1.832342 2.159889
## | 0.2007 0.0923
##
## alpha = 0.05
## Reject Ho if p <= alpha
## epsilon2 = 0.00822 (95% CI: 0.00103 - 0.0266 )
##
## Test for the significance of differences in RecipientIndividuation over PA5 :
##
## Kruskal-Wallis rank sum test
##
## data: x and group
## Kruskal-Wallis chi-squared = 117.9317, df = 2, p-value = 0
##
##
## Comparison of x by group
## (Bonferroni)
## Col Mean-|
## Row Mean | bulk individu
## ---------+----------------------
## individu | 5.787178
## | 0.0000*
## |
## public | -0.324537 -10.43260
## | 1.0000 0.0000*
##
## alpha = 0.05
## Reject Ho if p <= alpha
## epsilon2 = 0.157 (95% CI: 0.113 - 0.211 )
##
## Test for the significance of differences in RecipientIndividuation over PA6 :
##
## Kruskal-Wallis rank sum test
##
## data: x and group
## Kruskal-Wallis chi-squared = 46.2243, df = 2, p-value = 0
##
##
## Comparison of x by group
## (Bonferroni)
## Col Mean-|
## Row Mean | bulk individu
## ---------+----------------------
## individu | 1.848811
## | 0.1935
## |
## public | 5.184785 5.774522
## | 0.0000* 0.0000*
##
## alpha = 0.05
## Reject Ho if p <= alpha
## epsilon2 = 0.0615 (95% CI: 0.034 - 0.099 )
##
## Test for the significance of differences in RecipientIndividuation over PA4 :
##
## Kruskal-Wallis rank sum test
##
## data: x and group
## Kruskal-Wallis chi-squared = 5.8732, df = 2, p-value = 0.05
##
##
## Comparison of x by group
## (Bonferroni)
## Col Mean-|
## Row Mean | bulk individu
## ---------+----------------------
## individu | -0.767062
## | 1.0000
## |
## public | 0.646113 2.421398
## | 1.0000 0.0464*
##
## alpha = 0.05
## Reject Ho if p <= alpha
## epsilon2 = 0.00781 (95% CI: 0.000906 - 0.0274 )
##
## Test for the significance of differences in RecipientIndividuation over PA8 :
##
## Kruskal-Wallis rank sum test
##
## data: x and group
## Kruskal-Wallis chi-squared = 3.3278, df = 2, p-value = 0.19
##
##
## Comparison of x by group
## (Bonferroni)
## Col Mean-|
## Row Mean | bulk individu
## ---------+----------------------
## individu | 0.967678
## | 0.9996
## |
## public | 1.665889 1.217876
## | 0.2872 0.6698
##
## alpha = 0.05
## Reject Ho if p <= alpha
## epsilon2 = 0.00443 (95% CI: 0.000391 - 0.0212 )
##
## Test for the significance of differences in RecipientIndividuation over PA7 :
##
## Kruskal-Wallis rank sum test
##
## data: x and group
## Kruskal-Wallis chi-squared = 25.8542, df = 2, p-value = 0
##
##
## Comparison of x by group
## (Bonferroni)
## Col Mean-|
## Row Mean | bulk individu
## ---------+----------------------
## individu | 1.186493
## | 0.7063
## |
## public | 3.743362 4.422216
## | 0.0005* 0.0000*
##
## alpha = 0.05
## Reject Ho if p <= alpha
## epsilon2 = 0.0344 (95% CI: 0.0153 - 0.0683 )
##
## factor chi2 kruskal_p epsilon2_lci epsilon2 epsilon2_uci
## 1 PA1 231.76 < .0001 0.255 0.308 0.368
## 2 PA2 39.72 < .0001 0.025 0.053 0.093
## 3 PA3 6.18 < .05 0.001 0.008 0.027
## 4 PA5 117.93 < .0001 0.113 0.157 0.211
## 5 PA6 46.22 < .0001 0.034 0.062 0.099
## 6 PA4 5.87 0.05 0.001 0.008 0.027
## 7 PA8 3.33 0.19 0.000 0.004 0.021
## 8 PA7 25.85 < .0001 0.015 0.034 0.068
##
## p < 5e-2 found in: PA1 PA2 PA5 PA6 PA4 PA7
## p < 1e-2 found in: PA1 PA2 PA5 PA6 PA7
## p < 1e-3 found in: PA1 PA2 PA5 PA6 PA7
## p < 1e-4 found in: PA1 PA2 PA5 PA6 PA7
# Compare the factors across the levels of "Objectivity".
# analyze_distributions() is defined outside this chunk. Judging by the output
# it prints below, it tabulates the grouping variable, saves a plot, and for
# each factor runs a Kruskal-Wallis test with Bonferroni-adjusted pairwise
# comparisons and an epsilon-squared effect size, followed by a summary table.
# NOTE(review): the groups are very unbalanced here (21 vs 729 in the table
# printed below), so the per-factor results deserve cautious interpretation.
analyze_distributions(broad_data$long, "Objectivity")
##
## persuasive quasiobjective <NA>
## 21 729 3
## Saving 7 x 5 in image
##
## Test for the significance of differences in Objectivity over PA1 :
##
## Kruskal-Wallis rank sum test
##
## data: x and group
## Kruskal-Wallis chi-squared = 0.2128, df = 1, p-value = 0.64
##
##
## Comparison of x by group
## (Bonferroni)
## Col Mean-|
## Row Mean | persuasi
## ---------+-----------
## quasiobj | -0.461269
## | 0.6446
##
## alpha = 0.05
## Reject Ho if p <= alpha
## epsilon2 = 0.000283 (95% CI: 9.09e-07 - 0.00636 )
##
## Test for the significance of differences in Objectivity over PA2 :
##
## Kruskal-Wallis rank sum test
##
## data: x and group
## Kruskal-Wallis chi-squared = 5.7127, df = 1, p-value = 0.02
##
##
## Comparison of x by group
## (Bonferroni)
## Col Mean-|
## Row Mean | persuasi
## ---------+-----------
## quasiobj | -2.390123
## | 0.0168*
##
## alpha = 0.05
## Reject Ho if p <= alpha
## epsilon2 = 0.0076 (95% CI: 0.000545 - 0.0243 )
##
## Test for the significance of differences in Objectivity over PA3 :
##
## Kruskal-Wallis rank sum test
##
## data: x and group
## Kruskal-Wallis chi-squared = 0.7303, df = 1, p-value = 0.39
##
##
## Comparison of x by group
## (Bonferroni)
## Col Mean-|
## Row Mean | persuasi
## ---------+-----------
## quasiobj | -0.854600
## | 0.3928
##
## alpha = 0.05
## Reject Ho if p <= alpha
## epsilon2 = 0.000971 (95% CI: 2.12e-06 - 0.0119 )
##
## Test for the significance of differences in Objectivity over PA5 :
##
## Kruskal-Wallis rank sum test
##
## data: x and group
## Kruskal-Wallis chi-squared = 2.7409, df = 1, p-value = 0.1
##
##
## Comparison of x by group
## (Bonferroni)
## Col Mean-|
## Row Mean | persuasi
## ---------+-----------
## quasiobj | -1.655565
## | 0.0978
##
## alpha = 0.05
## Reject Ho if p <= alpha
## epsilon2 = 0.00364 (95% CI: 2.05e-05 - 0.0215 )
##
## Test for the significance of differences in Objectivity over PA6 :
##
## Kruskal-Wallis rank sum test
##
## data: x and group
## Kruskal-Wallis chi-squared = 0.7585, df = 1, p-value = 0.38
##
##
## Comparison of x by group
## (Bonferroni)
## Col Mean-|
## Row Mean | persuasi
## ---------+-----------
## quasiobj | 0.870946
## | 0.3838
##
## alpha = 0.05
## Reject Ho if p <= alpha
## epsilon2 = 0.00101 (95% CI: 4.3e-06 - 0.0154 )
##
## Test for the significance of differences in Objectivity over PA4 :
##
## Kruskal-Wallis rank sum test
##
## data: x and group
## Kruskal-Wallis chi-squared = 0.7044, df = 1, p-value = 0.4
##
##
## Comparison of x by group
## (Bonferroni)
## Col Mean-|
## Row Mean | persuasi
## ---------+-----------
## quasiobj | -0.839276
## | 0.4013
##
## alpha = 0.05
## Reject Ho if p <= alpha
## epsilon2 = 0.000937 (95% CI: 2.29e-06 - 0.0122 )
##
## Test for the significance of differences in Objectivity over PA8 :
##
## Kruskal-Wallis rank sum test
##
## data: x and group
## Kruskal-Wallis chi-squared = 0.314, df = 1, p-value = 0.58
##
##
## Comparison of x by group
## (Bonferroni)
## Col Mean-|
## Row Mean | persuasi
## ---------+-----------
## quasiobj | 0.560368
## | 0.5752
##
## alpha = 0.05
## Reject Ho if p <= alpha
## epsilon2 = 0.000418 (95% CI: 1.39e-06 - 0.00914 )
##
## Test for the significance of differences in Objectivity over PA7 :
##
## Kruskal-Wallis rank sum test
##
## data: x and group
## Kruskal-Wallis chi-squared = 2.8021, df = 1, p-value = 0.09
##
##
## Comparison of x by group
## (Bonferroni)
## Col Mean-|
## Row Mean | persuasi
## ---------+-----------
## quasiobj | -1.673954
## | 0.0941
##
## alpha = 0.05
## Reject Ho if p <= alpha
## epsilon2 = 0.00373 (95% CI: 1.63e-05 - 0.0174 )
##
## factor chi2 kruskal_p epsilon2_lci epsilon2 epsilon2_uci
## 1 PA1 0.21 0.64 0.000 0.000 0.006
## 2 PA2 5.71 < .05 0.001 0.008 0.024
## 3 PA3 0.73 0.39 0.000 0.001 0.012
## 4 PA5 2.74 0.1 0.000 0.004 0.021
## 5 PA6 0.76 0.38 0.000 0.001 0.015
## 6 PA4 0.70 0.4 0.000 0.001 0.012
## 7 PA8 0.31 0.58 0.000 0.000 0.009
## 8 PA7 2.80 0.09 0.000 0.004 0.017
##
## p < 5e-2 found in: PA2
## p < 1e-2 found in:
## p < 1e-3 found in:
## p < 1e-4 found in:
# Compare the factors across the levels of "Bindingness" (FALSE / TRUE in the
# table printed below).
# analyze_distributions() is defined outside this chunk. Judging by the output
# it prints below, it tabulates the grouping variable, saves a plot, and for
# each factor runs a Kruskal-Wallis test with Bonferroni-adjusted pairwise
# comparisons and an epsilon-squared effect size, followed by a summary table.
analyze_distributions(broad_data$long, "Bindingness")
##
## FALSE TRUE <NA>
## 444 303 6
## Saving 7 x 5 in image
##
## Test for the significance of differences in Bindingness over PA1 :
##
## Kruskal-Wallis rank sum test
##
## data: x and group
## Kruskal-Wallis chi-squared = 380.9685, df = 1, p-value = 0
##
##
## Comparison of x by group
## (Bonferroni)
## Col Mean-|
## Row Mean | FALSE
## ---------+-----------
## TRUE | 19.51841
## | 0.0000*
##
## alpha = 0.05
## Reject Ho if p <= alpha
## epsilon2 = 0.507 (95% CI: 0.453 - 0.557 )
##
## Test for the significance of differences in Bindingness over PA2 :
##
## Kruskal-Wallis rank sum test
##
## data: x and group
## Kruskal-Wallis chi-squared = 0.0529, df = 1, p-value = 0.82
##
##
## Comparison of x by group
## (Bonferroni)
## Col Mean-|
## Row Mean | FALSE
## ---------+-----------
## TRUE | -0.229985
## | 0.8181
##
## alpha = 0.05
## Reject Ho if p <= alpha
## epsilon2 = 7.03e-05 (95% CI: 8.14e-07 - 0.0072 )
##
## Test for the significance of differences in Bindingness over PA3 :
##
## Kruskal-Wallis rank sum test
##
## data: x and group
## Kruskal-Wallis chi-squared = 7.2737, df = 1, p-value = 0.01
##
##
## Comparison of x by group
## (Bonferroni)
## Col Mean-|
## Row Mean | FALSE
## ---------+-----------
## TRUE | -2.696982
## | 0.0070*
##
## alpha = 0.05
## Reject Ho if p <= alpha
## epsilon2 = 0.00967 (95% CI: 0.000436 - 0.0282 )
##
## Test for the significance of differences in Bindingness over PA5 :
##
## Kruskal-Wallis rank sum test
##
## data: x and group
## Kruskal-Wallis chi-squared = 118.7006, df = 1, p-value = 0
##
##
## Comparison of x by group
## (Bonferroni)
## Col Mean-|
## Row Mean | FALSE
## ---------+-----------
## TRUE | 10.89497
## | 0.0000*
##
## alpha = 0.05
## Reject Ho if p <= alpha
## epsilon2 = 0.158 (95% CI: 0.111 - 0.206 )
##
## Test for the significance of differences in Bindingness over PA6 :
##
## Kruskal-Wallis rank sum test
##
## data: x and group
## Kruskal-Wallis chi-squared = 49.5439, df = 1, p-value = 0
##
##
## Comparison of x by group
## (Bonferroni)
## Col Mean-|
## Row Mean | FALSE
## ---------+-----------
## TRUE | -7.038743
## | 0.0000*
##
## alpha = 0.05
## Reject Ho if p <= alpha
## epsilon2 = 0.0659 (95% CI: 0.0337 - 0.109 )
##
## Test for the significance of differences in Bindingness over PA4 :
##
## Kruskal-Wallis rank sum test
##
## data: x and group
## Kruskal-Wallis chi-squared = 30.6385, df = 1, p-value = 0
##
##
## Comparison of x by group
## (Bonferroni)
## Col Mean-|
## Row Mean | FALSE
## ---------+-----------
## TRUE | -5.535201
## | 0.0000*
##
## alpha = 0.05
## Reject Ho if p <= alpha
## epsilon2 = 0.0407 (95% CI: 0.0165 - 0.0724 )
##
## Test for the significance of differences in Bindingness over PA8 :
##
## Kruskal-Wallis rank sum test
##
## data: x and group
## Kruskal-Wallis chi-squared = 9.501, df = 1, p-value = 0
##
##
## Comparison of x by group
## (Bonferroni)
## Col Mean-|
## Row Mean | FALSE
## ---------+-----------
## TRUE | -3.082363
## | 0.0021*
##
## alpha = 0.05
## Reject Ho if p <= alpha
## epsilon2 = 0.0126 (95% CI: 0.0017 - 0.0331 )
##
## Test for the significance of differences in Bindingness over PA7 :
##
## Kruskal-Wallis rank sum test
##
## data: x and group
## Kruskal-Wallis chi-squared = 5.5352, df = 1, p-value = 0.02
##
##
## Comparison of x by group
## (Bonferroni)
## Col Mean-|
## Row Mean | FALSE
## ---------+-----------
## TRUE | -2.352693
## | 0.0186*
##
## alpha = 0.05
## Reject Ho if p <= alpha
## epsilon2 = 0.00736 (95% CI: 0.000321 - 0.0247 )
##
## factor chi2 kruskal_p epsilon2_lci epsilon2 epsilon2_uci
## 1 PA1 380.97 < .0001 0.453 0.507 0.557
## 2 PA2 0.05 0.82 0.000 0.000 0.007
## 3 PA3 7.27 < .01 0.000 0.010 0.028
## 4 PA5 118.70 < .0001 0.111 0.158 0.206
## 5 PA6 49.54 < .0001 0.034 0.066 0.109
## 6 PA4 30.64 < .0001 0.016 0.041 0.072
## 7 PA8 9.50 < .01 0.002 0.013 0.033
## 8 PA7 5.54 < .05 0.000 0.007 0.025
##
## p < 5e-2 found in: PA1 PA3 PA5 PA6 PA4 PA8 PA7
## p < 1e-2 found in: PA1 PA3 PA5 PA6 PA4 PA8
## p < 1e-3 found in: PA1 PA5 PA6 PA4
## p < 1e-4 found in: PA1 PA5 PA6 PA4
# Correlation (cor() default method) between the feature values and the factor
# scores, one row per (feat, factor) pair.
# `.groups = "drop"` returns an ungrouped tibble: without it the result stays
# grouped by `feat`, so any later verb would silently operate per feature, and
# summarize() prints a regrouping message.
broad_data_factors_corr <- broad_data$feat_long %>%
  group_by(feat, factor) %>%
  summarize(
    correlation = cor(feat_value, factor_score),
    .groups = "drop"
  )
## `summarise()` has grouped output by 'feat'. You can override using the
## `.groups` argument.
# Heatmap of the feature-factor correlations, restricted to the features that
# appear as rows of the fa_broad loading matrix. Tiles are coloured by the
# correlation and annotated with its value rounded to two decimals.
ggplot(
  data = filter(
    broad_data_factors_corr,
    feat %in% rownames(fa_broad$loadings)
  ),
  mapping = aes(x = factor, y = feat, fill = correlation)
) +
  geom_tile() +
  geom_text(aes(label = round(correlation, 2))) +
  scale_fill_gradient2()
# Heatmap of the feature-factor correlations for the features that are NOT
# rows of the fa_broad loading matrix.
# The plot is stored in a variable so that ggsave() receives it explicitly
# instead of depending on whatever last_plot() happens to be at that moment.
varfactcorr_plot <- broad_data_factors_corr %>%
  filter(!(feat %in% rownames(fa_broad$loadings))) %>%
  ggplot(aes(
    x = factor,
    y = feat,
    fill = correlation,
    label = round(correlation, 2)
  )) +
  geom_tile() +
  geom_text() +
  scale_fill_gradient2() +
  labs(x = "factors", y = "variables") +
  theme_minimal()
# Display the plot, then write the same object to disk.
varfactcorr_plot
ggsave("varfactcorr.pdf", plot = varfactcorr_plot)
## Saving 7 x 9 in image
# Diagnostics for choosing the number of factors in data_scaled.
# NOTE(review): the variables `eigen` and `par` shadow base::eigen() and
# graphics::par(); the names are kept as-is because code outside this chunk
# may refer to them.

# Eigen-decomposition of the correlation matrix.
eigen <- eigen(x = cor(x = data_scaled))

# Simulated reference eigenvalues (100 replications, .95 quantile) for a
# data set with the same number of rows and columns.
par <- nFactors::parallel(
  model = "factors",
  subject = nrow(data_scaled),
  var = ncol(data_scaled),
  quantile = .95,
  rep = 100
)

# Scree-based criteria from nFactors, plotted.
scree <- nScree(
  aparallel = par[["eigen"]][["qevpea"]],
  x = eigen[["values"]]
)
plotnScree(scree)

# psych's own parallel analysis for factors only (fa = "fa"), with fm = "pa".
fa.parallel(x = data_scaled, fa = "fa", fm = "pa", n.iter = 20)
## Parallel analysis suggests that the number of factors = 8 and the number of components = NA
# Documentation for psych::fa(): https://www.rdocumentation.org/packages/psych/versions/2.5.3/topics/fa
# First factor model on the full scaled data: 8 factors, fm = "pa", promax
# rotation, tenBerge factor scores. n.iter = 100 requests bootstrapped
# confidence intervals (see the printed output), so the seed is fixed
# immediately before the fit.
set.seed(42)
fa_1 <- fa(
  r = data_scaled,
  nfactors = 8,
  n.iter = 100,
  rotate = "promax",
  scores = "tenBerge",
  fm = "pa",
  oblique.scores = TRUE
)
print(fa_1)
## Factor Analysis with confidence intervals using method = fa(r = data_scaled, nfactors = 8, n.iter = 100, rotate = "promax",
## scores = "tenBerge", fm = "pa", oblique.scores = TRUE)
## Factor Analysis using method = pa
## Call: fa(r = data_scaled, nfactors = 8, n.iter = 100, rotate = "promax",
## scores = "tenBerge", fm = "pa", oblique.scores = TRUE)
## Standardized loadings (pattern matrix) based upon correlation matrix
## PA1 PA2 PA3 PA5 PA6 PA4 PA8 PA7 h2 u2
## sentlen.m -0.62 -0.02 -0.03 -0.28 0.00 0.37 0.15 -0.02 0.94 0.063
## sentcount 0.15 0.96 0.03 0.32 -0.07 -0.16 0.00 -0.01 0.93 0.066
## atl 0.70 0.00 -0.02 0.06 -0.05 -0.13 0.10 0.30 0.57 0.431
## activity 0.66 -0.01 0.10 0.47 0.00 0.31 -0.09 -0.09 0.89 0.106
## VERBfrac.m 0.80 -0.06 0.20 0.35 -0.02 0.10 -0.12 -0.05 0.90 0.100
## wordcount -0.15 0.95 0.00 0.01 0.02 0.00 -0.05 0.01 0.89 0.114
## entropy 0.03 0.72 0.07 -0.02 0.10 -0.04 -0.12 0.39 0.86 0.141
## sentlen.v 0.00 -0.01 0.73 0.28 0.01 -0.15 0.05 -0.02 0.46 0.538
## predsubjdist.m -0.08 -0.04 0.25 0.12 -0.04 0.06 0.55 -0.04 0.45 0.555
## compoundVERBs 0.99 -0.15 0.30 -0.31 0.07 -0.18 -0.14 -0.04 0.70 0.298
## passives 0.03 -0.09 -0.03 -0.79 0.15 -0.25 -0.06 -0.09 0.57 0.427
## predobjdist.m 0.08 -0.12 0.60 0.01 -0.05 -0.08 0.29 0.00 0.42 0.583
## literary 0.00 -0.04 0.07 -0.34 0.15 0.14 -0.05 0.06 0.24 0.758
## verbdist -0.74 0.00 0.00 -0.12 -0.06 -0.25 0.26 -0.04 0.81 0.188
## maentropy -0.19 -0.07 -0.15 -0.03 0.12 -0.01 -0.01 0.82 0.76 0.245
## predorder.m -0.45 -0.07 0.06 0.06 -0.04 0.19 0.51 0.07 0.70 0.297
## hapaxes 0.10 -0.83 0.07 0.07 0.01 -0.10 0.01 0.29 0.72 0.282
## VERBcomp 0.56 0.02 -0.01 0.15 -0.15 0.54 -0.01 0.04 0.60 0.404
## NOUNcount.v -0.33 -0.04 0.43 -0.08 -0.05 0.01 -0.22 -0.03 0.41 0.594
## subj 0.69 0.12 -0.14 -0.04 0.11 -0.02 0.13 -0.14 0.58 0.422
## NOUNcount.m -0.84 0.05 0.01 -0.08 -0.17 -0.10 0.14 0.07 0.79 0.209
## predobjdist.v 0.05 0.14 0.51 -0.07 0.07 0.04 0.07 0.02 0.39 0.606
## NEGcount.m 0.04 -0.05 -0.06 0.08 1.00 0.08 0.03 0.09 0.94 0.063
## compoundVERBsdist.m 0.13 -0.02 0.71 -0.14 -0.08 -0.04 -0.03 -0.14 0.43 0.566
## VERBfrac.v -0.55 -0.03 0.15 0.23 -0.04 -0.21 -0.06 0.06 0.35 0.648
## NEGcount.v 0.21 0.09 0.01 -0.03 0.75 0.02 -0.11 0.07 0.59 0.415
## compoundVERBsdist.v -0.07 0.23 0.28 -0.20 0.04 0.00 0.06 -0.03 0.33 0.672
## predsubjdist.v -0.14 0.10 0.38 -0.03 0.10 0.13 0.17 0.03 0.47 0.533
## mamr 0.84 -0.07 -0.06 0.02 0.01 0.02 0.16 -0.17 0.77 0.234
## obj 0.08 -0.03 -0.06 0.00 0.08 0.83 0.10 -0.02 0.68 0.322
## predorder.v -0.05 -0.02 0.52 -0.05 0.07 0.16 0.17 0.08 0.54 0.463
## verbalNOUNs 0.23 0.05 -0.02 -0.12 -0.14 -0.18 0.00 0.04 0.14 0.862
## NEGfrac.m -0.03 -0.02 -0.03 0.60 0.29 -0.21 0.09 -0.09 0.40 0.602
## com
## sentlen.m 2.2
## sentcount 1.3
## atl 1.5
## activity 2.4
## VERBfrac.m 1.6
## wordcount 1.1
## entropy 1.7
## sentlen.v 1.4
## predsubjdist.m 1.6
## compoundVERBs 1.6
## passives 1.4
## predobjdist.m 1.6
## literary 2.0
## verbdist 1.6
## maentropy 1.2
## predorder.m 2.4
## hapaxes 1.3
## VERBcomp 2.3
## NOUNcount.v 2.6
## subj 1.4
## NOUNcount.m 1.2
## predobjdist.v 1.3
## NEGcount.m 1.1
## compoundVERBsdist.m 1.3
## VERBfrac.v 1.9
## NEGcount.v 1.3
## compoundVERBsdist.v 3.1
## predsubjdist.v 2.4
## mamr 1.2
## obj 1.1
## predorder.v 1.6
## verbalNOUNs 3.4
## NEGfrac.m 1.9
##
## PA1 PA2 PA3 PA5 PA6 PA4 PA8 PA7
## SS loadings 6.71 3.10 2.53 2.08 1.74 1.56 1.29 1.19
## Proportion Var 0.20 0.09 0.08 0.06 0.05 0.05 0.04 0.04
## Cumulative Var 0.20 0.30 0.37 0.44 0.49 0.54 0.58 0.61
## Proportion Explained 0.33 0.15 0.13 0.10 0.09 0.08 0.06 0.06
## Cumulative Proportion 0.33 0.49 0.61 0.71 0.80 0.88 0.94 1.00
##
## With factor correlations of
## PA1 PA2 PA3 PA5 PA6 PA4 PA8 PA7
## PA1 1.00 0.11 -0.56 0.38 -0.37 -0.18 -0.36 -0.17
## PA2 0.11 1.00 0.17 -0.26 0.27 0.25 0.01 0.18
## PA3 -0.56 0.17 1.00 -0.33 0.30 0.32 0.24 0.11
## PA5 0.38 -0.26 -0.33 1.00 -0.34 -0.23 -0.38 -0.17
## PA6 -0.37 0.27 0.30 -0.34 1.00 0.32 0.11 0.07
## PA4 -0.18 0.25 0.32 -0.23 0.32 1.00 0.00 0.08
## PA8 -0.36 0.01 0.24 -0.38 0.11 0.00 1.00 -0.10
## PA7 -0.17 0.18 0.11 -0.17 0.07 0.08 -0.10 1.00
##
## Mean item complexity = 1.7
## Test of the hypothesis that 8 factors are sufficient.
##
## df null model = 528 with the objective function = 24.21 with Chi Square = 17922.49
## df of the model are 292 and the objective function was 2.94
##
## The root mean square of the residuals (RMSR) is 0.03
## The df corrected root mean square of the residuals is 0.03
##
## The harmonic n.obs is 753 with the empirical chi square 514.88 with prob < 1.6e-14
## The total n.obs was 753 with Likelihood Chi Square = 2157.52 with prob < 2.7e-281
##
## Tucker Lewis Index of factoring reliability = 0.805
## RMSEA index = 0.092 and the 90 % confidence intervals are 0.089 0.096
## BIC = 223.3
## Fit based upon off diagonal values = 0.99
## Measures of factor score adequacy
## PA1 PA2 PA3 PA5 PA6 PA4
## Correlation of (regression) scores with factors 0.98 0.98 0.92 0.94 0.98 0.94
## Multiple R square of scores with factors 0.96 0.96 0.85 0.89 0.96 0.89
## Minimum correlation of possible factor scores 0.92 0.92 0.70 0.77 0.91 0.78
## PA8 PA7
## Correlation of (regression) scores with factors 0.87 0.91
## Multiple R square of scores with factors 0.75 0.82
## Minimum correlation of possible factor scores 0.50 0.65
##
## Coefficients and bootstrapped confidence intervals
## low PA1 upper low PA2 upper low PA3 upper low
## sentlen.m -0.70 -0.62 -0.52 -0.06 -0.02 0.00 -0.09 -0.03 0.05 -0.33
## sentcount 0.09 0.15 0.20 0.92 0.96 1.01 0.00 0.03 0.07 0.25
## atl 0.52 0.70 0.76 -0.05 0.00 0.07 -0.11 -0.02 0.08 -0.06
## activity 0.56 0.66 0.77 -0.04 -0.01 0.02 0.03 0.10 0.15 0.40
## VERBfrac.m 0.66 0.80 0.94 -0.09 -0.06 -0.01 0.13 0.20 0.25 0.27
## wordcount -0.19 -0.15 -0.09 0.91 0.95 0.98 -0.04 0.00 0.04 -0.02
## entropy -0.03 0.03 0.07 0.68 0.72 0.76 0.02 0.07 0.11 -0.07
## sentlen.v -0.09 0.00 0.06 -0.07 -0.01 0.07 0.60 0.73 0.88 0.19
## predsubjdist.m -0.30 -0.08 0.04 -0.09 -0.04 0.02 0.15 0.25 0.39 -0.04
## compoundVERBs 0.81 0.99 1.17 -0.22 -0.15 -0.08 0.19 0.30 0.39 -0.40
## passives -0.04 0.03 0.10 -0.14 -0.09 -0.05 -0.09 -0.03 0.04 -0.85
## predobjdist.m -0.08 0.08 0.18 -0.18 -0.12 -0.05 0.44 0.60 0.80 -0.16
## literary -0.10 0.00 0.12 -0.13 -0.04 0.04 -0.04 0.07 0.17 -0.42
## verbdist -0.87 -0.74 -0.65 -0.03 0.00 0.02 -0.04 0.00 0.06 -0.27
## maentropy -0.30 -0.19 -0.15 -0.09 -0.07 -0.01 -0.23 -0.15 -0.10 -0.12
## predorder.m -0.73 -0.45 -0.30 -0.11 -0.07 0.00 -0.03 0.06 0.17 -0.11
## hapaxes 0.01 0.10 0.15 -0.86 -0.83 -0.78 0.01 0.07 0.11 0.00
## VERBcomp 0.44 0.56 0.65 -0.03 0.02 0.07 -0.08 -0.01 0.06 0.07
## NOUNcount.v -0.40 -0.33 -0.16 -0.12 -0.04 0.03 0.27 0.43 0.60 -0.14
## subj 0.55 0.69 0.76 0.08 0.12 0.18 -0.20 -0.14 -0.09 -0.14
## NOUNcount.m -0.98 -0.84 -0.70 0.00 0.05 0.09 -0.06 0.01 0.10 -0.15
## predobjdist.v -0.10 0.05 0.17 0.05 0.14 0.26 0.38 0.51 0.65 -0.16
## NEGcount.m -0.04 0.04 0.07 -0.09 -0.05 -0.01 -0.12 -0.06 -0.02 -0.01
## compoundVERBsdist.m 0.03 0.13 0.25 -0.09 -0.02 0.06 0.59 0.71 0.85 -0.21
## VERBfrac.v -0.65 -0.55 -0.42 -0.10 -0.03 0.04 0.05 0.15 0.24 0.12
## NEGcount.v 0.16 0.21 0.29 0.04 0.09 0.13 -0.04 0.01 0.06 -0.10
## compoundVERBsdist.v -0.19 -0.07 0.04 0.16 0.23 0.31 0.14 0.28 0.43 -0.32
## predsubjdist.v -0.30 -0.14 -0.03 0.04 0.10 0.16 0.25 0.38 0.52 -0.13
## mamr 0.69 0.84 0.90 -0.12 -0.07 -0.01 -0.13 -0.06 0.02 -0.07
## obj 0.01 0.08 0.14 -0.07 -0.03 0.02 -0.12 -0.06 0.01 -0.06
## predorder.v -0.23 -0.05 0.07 -0.09 -0.02 0.07 0.31 0.52 0.73 -0.16
## verbalNOUNs 0.10 0.23 0.34 -0.03 0.05 0.12 -0.13 -0.02 0.09 -0.21
## NEGfrac.m -0.16 -0.03 0.06 -0.08 -0.02 0.04 -0.12 -0.03 0.05 0.48
## PA5 upper low PA6 upper low PA4 upper low PA8
## sentlen.m -0.28 -0.22 -0.03 0.00 0.05 0.32 0.37 0.43 0.08 0.15
## sentcount 0.32 0.35 -0.11 -0.07 -0.03 -0.20 -0.16 -0.13 -0.17 0.00
## atl 0.06 0.12 -0.15 -0.05 0.04 -0.25 -0.13 -0.05 -0.36 0.10
## activity 0.47 0.54 -0.04 0.00 0.04 0.27 0.31 0.37 -0.22 -0.09
## VERBfrac.m 0.35 0.43 -0.06 -0.02 0.03 0.06 0.10 0.15 -0.41 -0.12
## wordcount 0.01 0.06 -0.01 0.02 0.06 -0.04 0.00 0.03 -0.13 -0.05
## entropy -0.02 0.02 0.07 0.10 0.15 -0.09 -0.04 -0.01 -0.38 -0.12
## sentlen.v 0.28 0.35 -0.07 0.01 0.07 -0.21 -0.15 -0.10 -0.07 0.05
## predsubjdist.m 0.12 0.22 -0.11 -0.04 0.05 -0.05 0.06 0.18 0.16 0.55
## compoundVERBs -0.31 -0.21 0.01 0.07 0.14 -0.25 -0.18 -0.10 -0.44 -0.14
## passives -0.79 -0.66 0.09 0.15 0.22 -0.31 -0.25 -0.19 -0.15 -0.06
## predobjdist.m 0.01 0.09 -0.17 -0.05 0.03 -0.16 -0.08 0.02 -0.16 0.29
## literary -0.34 -0.23 0.07 0.15 0.25 0.05 0.14 0.23 -0.16 -0.05
## verbdist -0.12 -0.01 -0.10 -0.06 -0.02 -0.30 -0.25 -0.20 0.19 0.26
## maentropy -0.03 0.01 0.07 0.12 0.19 -0.07 -0.01 0.04 -0.36 -0.01
## predorder.m 0.06 0.14 -0.14 -0.04 0.06 0.06 0.19 0.28 0.22 0.51
## hapaxes 0.07 0.12 -0.05 0.01 0.05 -0.15 -0.10 -0.05 -0.14 0.01
## VERBcomp 0.15 0.22 -0.21 -0.15 -0.07 0.47 0.54 0.64 -0.23 -0.01
## NOUNcount.v -0.08 0.06 -0.12 -0.05 0.06 -0.08 0.01 0.10 -0.40 -0.22
## subj -0.04 0.00 0.03 0.11 0.17 -0.09 -0.02 0.04 -0.09 0.13
## NOUNcount.m -0.08 -0.01 -0.25 -0.17 -0.09 -0.17 -0.10 -0.04 0.03 0.14
## predobjdist.v -0.07 0.02 -0.02 0.07 0.14 -0.04 0.04 0.15 -0.18 0.07
## NEGcount.m 0.08 0.12 0.84 1.00 1.12 0.04 0.08 0.15 -0.12 0.03
## compoundVERBsdist.m -0.14 -0.05 -0.15 -0.08 -0.01 -0.10 -0.04 0.02 -0.17 -0.03
## VERBfrac.v 0.23 0.36 -0.13 -0.04 0.07 -0.31 -0.21 -0.13 -0.36 -0.06
## NEGcount.v -0.03 0.05 0.65 0.75 0.92 -0.03 0.02 0.08 -0.30 -0.11
## compoundVERBsdist.v -0.20 -0.08 -0.04 0.04 0.12 -0.08 0.00 0.10 -0.09 0.06
## predsubjdist.v -0.03 0.08 0.01 0.10 0.19 0.05 0.13 0.22 -0.05 0.17
## mamr 0.02 0.06 -0.08 0.01 0.06 -0.05 0.02 0.08 -0.07 0.16
## obj 0.00 0.05 0.04 0.08 0.15 0.75 0.83 0.94 0.00 0.10
## predorder.v -0.05 0.05 -0.02 0.07 0.15 0.07 0.16 0.26 -0.02 0.17
## verbalNOUNs -0.12 -0.03 -0.25 -0.14 -0.03 -0.29 -0.18 -0.08 -0.23 0.00
## NEGfrac.m 0.60 0.65 0.19 0.29 0.39 -0.28 -0.21 -0.15 -0.16 0.09
## upper low PA7 upper
## sentlen.m 0.40 -0.05 -0.02 0.04
## sentcount 0.06 -0.07 -0.01 0.02
## atl 0.31 0.16 0.30 0.41
## activity -0.02 -0.16 -0.09 -0.04
## VERBfrac.m 0.05 -0.12 -0.05 0.01
## wordcount 0.00 -0.03 0.01 0.06
## entropy -0.04 0.33 0.39 0.55
## sentlen.v 0.27 -0.10 -0.02 0.06
## predsubjdist.m 1.26 -0.31 -0.04 0.16
## compoundVERBs 0.02 -0.11 -0.04 0.04
## passives 0.07 -0.15 -0.09 -0.02
## predobjdist.m 0.83 -0.16 0.00 0.10
## literary 0.09 -0.01 0.06 0.16
## verbdist 0.47 -0.11 -0.04 -0.01
## maentropy 0.07 0.70 0.82 1.03
## predorder.m 0.97 -0.10 0.07 0.13
## hapaxes 0.06 0.22 0.29 0.38
## VERBcomp 0.11 -0.03 0.04 0.11
## NOUNcount.v 0.05 -0.13 -0.03 0.13
## subj 0.23 -0.31 -0.14 -0.09
## NOUNcount.m 0.36 -0.01 0.07 0.15
## predobjdist.v 0.39 -0.10 0.02 0.14
## NEGcount.m 0.13 0.04 0.09 0.16
## compoundVERBsdist.m 0.21 -0.27 -0.14 -0.05
## VERBfrac.v 0.29 -0.04 0.06 0.23
## NEGcount.v 0.00 0.02 0.07 0.18
## compoundVERBsdist.v 0.28 -0.15 -0.03 0.09
## predsubjdist.v 0.57 -0.11 0.03 0.11
## mamr 0.27 -0.34 -0.17 -0.13
## obj 0.25 -0.09 -0.02 0.03
## predorder.v 0.47 -0.04 0.08 0.19
## verbalNOUNs 0.16 -0.08 0.04 0.15
## NEGfrac.m 0.32 -0.24 -0.09 -0.02
##
## Interfactor correlations and bootstrapped confidence intervals
## lower estimate upper
## PA1-PA2 -0.289 0.1108 0.408
## PA1-PA3 -0.962 -0.5622 0.047
## PA1-PA5 -0.857 0.3830 0.368
## PA1-PA6 -0.722 -0.3665 0.056
## PA1-PA4 -0.619 -0.1818 0.117
## PA1-PA8 -0.556 -0.3611 0.180
## PA1-PA7 -0.466 -0.1660 0.165
## PA2-PA3 -0.020 0.1702 0.334
## PA2-PA5 -0.294 -0.2586 0.563
## PA2-PA6 -0.178 0.2683 0.511
## PA2-PA4 -0.113 0.2463 0.481
## PA2-PA8 -0.181 0.0064 0.437
## PA2-PA7 -0.188 0.1785 0.339
## PA3-PA5 -0.399 -0.3255 0.750
## PA3-PA6 -0.132 0.3000 0.669
## PA3-PA4 -0.093 0.3241 0.596
## PA3-PA8 -0.149 0.2427 0.556
## PA3-PA7 -0.238 0.1085 0.410
## PA5-PA6 -0.481 -0.3378 0.737
## PA5-PA4 -0.387 -0.2304 0.645
## PA5-PA8 -0.294 -0.3838 0.452
## PA5-PA7 -0.277 -0.1659 0.336
## PA6-PA4 -0.149 0.3221 0.488
## PA6-PA8 -0.209 0.1114 0.423
## PA6-PA7 -0.253 0.0710 0.324
## PA4-PA8 -0.179 -0.0029 0.389
## PA4-PA7 -0.182 0.0752 0.254
## PA8-PA7 -0.299 -0.1047 0.306
# Largest absolute loading of each feature in fa_1, sorted ascending so the
# features that load weakly on every factor come first.
# The feature label is taken from the loading matrix's own row names
# (as_tibble(rownames = )) rather than re-attached from colnames(data_scaled),
# so labels cannot drift out of step with the rows. This also removes the bare
# select() call, which is masked by MASS::select() in this session.
fa_1$loadings[] %>%
  as_tibble(rownames = "feat") %>%
  pivot_longer(!feat) %>%
  group_by(feat) %>%
  summarize(maxload = max(abs(value))) %>%
  arrange(maxload)
## # A tibble: 33 × 2
## feat maxload
## <chr> <dbl>
## 1 verbalNOUNs 0.232
## 2 compoundVERBsdist.v 0.281
## 3 literary 0.343
## 4 predsubjdist.v 0.377
## 5 NOUNcount.v 0.431
## 6 predobjdist.v 0.509
## 7 predorder.m 0.515
## 8 predorder.v 0.519
## 9 VERBfrac.v 0.549
## 10 predsubjdist.m 0.551
## # ℹ 23 more rows
# Communalities of fa_1 in ascending order.
sort(fa_1$communality)
## verbalNOUNs literary compoundVERBsdist.v VERBfrac.v
## 0.1379713 0.2423431 0.3280540 0.3524992
## predobjdist.v NEGfrac.m NOUNcount.v predobjdist.m
## 0.3939045 0.3975483 0.4064061 0.4169727
## compoundVERBsdist.m predsubjdist.m sentlen.v predsubjdist.v
## 0.4336188 0.4453023 0.4615505 0.4669617
## predorder.v atl passives subj
## 0.5370148 0.5694476 0.5733804 0.5775257
## NEGcount.v VERBcomp obj compoundVERBs
## 0.5854885 0.5958715 0.6784960 0.7020210
## predorder.m hapaxes maentropy mamr
## 0.7030408 0.7184036 0.7553256 0.7664031
## NOUNcount.m verbdist entropy wordcount
## 0.7910351 0.8118113 0.8591141 0.8864995
## activity VERBfrac.m sentcount sentlen.m
## 0.8937370 0.8998234 0.9344065 0.9365817
## NEGcount.m
## 0.9365996
# Names of the variables whose fa_1 communality is below 0.5.
names(fa_1$communality)[fa_1$communality < 0.5]
## [1] "sentlen.v" "predsubjdist.m" "predobjdist.m"
## [4] "literary" "NOUNcount.v" "predobjdist.v"
## [7] "compoundVERBsdist.m" "VERBfrac.v" "compoundVERBsdist.v"
## [10] "predsubjdist.v" "verbalNOUNs" "NEGfrac.m"
# Item complexities of fa_1 in ascending order.
sort(fa_1$complexity)
## wordcount NEGcount.m obj mamr
## 1.058480 1.059835 1.079227 1.183128
## NOUNcount.m maentropy NEGcount.v compoundVERBsdist.m
## 1.203656 1.249629 1.261795 1.268893
## predobjdist.v hapaxes sentcount passives
## 1.333335 1.333578 1.346796 1.350058
## subj sentlen.v atl predorder.v
## 1.372625 1.381042 1.509559 1.551827
## verbdist compoundVERBs VERBfrac.m predobjdist.m
## 1.558892 1.579530 1.616498 1.633887
## predsubjdist.m entropy NEGfrac.m VERBfrac.v
## 1.647062 1.696694 1.871425 1.926064
## literary sentlen.m VERBcomp predsubjdist.v
## 1.976897 2.244205 2.308159 2.404788
## predorder.m activity NOUNcount.v compoundVERBsdist.v
## 2.412118 2.434222 2.574050 3.113858
## verbalNOUNs
## 3.371824
# Names of the variables whose fa_1 complexity exceeds 2.
names(fa_1$complexity)[fa_1$complexity > 2]
## [1] "sentlen.m" "activity" "predorder.m"
## [4] "VERBcomp" "NOUNcount.v" "compoundVERBsdist.v"
## [7] "predsubjdist.v" "verbalNOUNs"
# Reduced data set: drop the variables whose communality in fa_1 is below 0.5
# (the names returned by `fa_1$communality[fa_1$communality < 0.5]`).
# select() is namespaced because MASS is attached after dplyr at the top of
# the file and masks dplyr::select().
data_engineered_1 <- data_scaled %>%
  dplyr::select(!c(
    sentlen.v, predsubjdist.m, predobjdist.m,
    literary, NOUNcount.v, predobjdist.v,
    compoundVERBsdist.m, VERBfrac.v, compoundVERBsdist.v,
    predsubjdist.v, verbalNOUNs, NEGfrac.m
  ))
# Determinant of the correlation matrix of the reduced data.
data_engineered_1 %>%
  cor() %>%
  det()
## [1] 1.165238e-08
# Kaiser-Meyer-Olkin sampling adequacy (overall and per item) for the reduced
# data.
KMO(r = data_engineered_1)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = data_engineered_1)
## Overall MSA = 0.85
## MSA for each item =
## sentlen.m sentcount atl activity VERBfrac.m
## 0.88 0.71 0.88 0.88 0.91
## wordcount entropy compoundVERBs passives verbdist
## 0.70 0.72 0.91 0.80 0.92
## maentropy predorder.m hapaxes VERBcomp subj
## 0.60 0.88 0.80 0.88 0.95
## NOUNcount.m NEGcount.m NEGcount.v mamr obj
## 0.92 0.75 0.67 0.92 0.60
## predorder.v
## 0.88
# Parallel analysis (factors only, fm = "pa") on the reduced data to pick the
# number of factors for the next model.
fa.parallel(
  x = data_engineered_1,
  fa = "fa",
  fm = "pa",
  n.iter = 20
)
## Parallel analysis suggests that the number of factors = 6 and the number of components = NA
# Second factor model, fitted on the reduced data with 6 factors; all other
# settings match the first model (fm = "pa", promax rotation, tenBerge
# scores). n.iter = 100 requests bootstrapped confidence intervals (see the
# printed output), so the seed is fixed immediately before the fit.
set.seed(42)
fa_2 <- fa(
  r = data_engineered_1,
  nfactors = 6,
  n.iter = 100,
  rotate = "promax",
  scores = "tenBerge",
  fm = "pa",
  oblique.scores = TRUE
)
print(fa_2)
## Factor Analysis with confidence intervals using method = fa(r = data_engineered_1, nfactors = 6, n.iter = 100, rotate = "promax",
## scores = "tenBerge", fm = "pa", oblique.scores = TRUE)
## Factor Analysis using method = pa
## Call: fa(r = data_engineered_1, nfactors = 6, n.iter = 100, rotate = "promax",
## scores = "tenBerge", fm = "pa", oblique.scores = TRUE)
## Standardized loadings (pattern matrix) based upon correlation matrix
## PA1 PA2 PA4 PA3 PA6 PA5 h2 u2 com
## sentlen.m -0.73 -0.01 -0.02 0.43 0.22 0.02 0.96 0.039 1.8
## sentcount 0.19 0.92 -0.07 -0.18 -0.20 -0.06 0.92 0.084 1.3
## atl 0.67 0.03 -0.09 -0.11 0.01 0.20 0.48 0.519 1.3
## activity 0.64 -0.04 0.03 0.23 -0.39 -0.10 0.87 0.127 2.0
## VERBfrac.m 0.79 -0.04 -0.03 0.10 -0.23 -0.06 0.88 0.120 1.2
## wordcount -0.12 0.93 0.01 0.02 0.02 0.03 0.89 0.111 1.0
## entropy 0.07 0.74 0.04 0.04 0.07 0.43 0.87 0.127 1.7
## compoundVERBs 0.91 -0.04 -0.02 0.02 0.38 0.02 0.62 0.375 1.3
## passives 0.10 -0.02 0.01 -0.02 0.81 0.03 0.59 0.413 1.0
## verbdist -0.84 0.00 -0.08 -0.21 0.11 -0.09 0.78 0.218 1.2
## maentropy -0.08 -0.05 0.01 0.04 0.05 0.87 0.78 0.215 1.0
## predorder.m -0.72 -0.03 -0.13 0.21 0.06 -0.04 0.59 0.411 1.3
## hapaxes 0.10 -0.80 -0.03 -0.08 -0.05 0.28 0.70 0.295 1.3
## VERBcomp 0.53 0.03 -0.14 0.50 -0.15 0.07 0.60 0.402 2.3
## subj 0.73 0.12 0.01 0.00 0.19 -0.14 0.52 0.481 1.3
## NOUNcount.m -0.94 0.05 -0.12 -0.11 -0.05 0.02 0.80 0.204 1.1
## NEGcount.m -0.07 -0.06 0.85 0.11 0.00 0.01 0.80 0.196 1.1
## NEGcount.v 0.16 0.07 0.81 0.02 0.01 0.00 0.68 0.316 1.1
## mamr 0.82 -0.05 -0.09 0.04 0.16 -0.21 0.72 0.275 1.2
## obj -0.05 -0.02 0.10 0.75 -0.03 0.01 0.62 0.385 1.0
## predorder.v -0.45 0.10 -0.01 0.27 0.08 0.02 0.35 0.654 1.8
##
## PA1 PA2 PA4 PA3 PA6 PA5
## SS loadings 6.81 2.93 1.50 1.32 1.28 1.20
## Proportion Var 0.32 0.14 0.07 0.06 0.06 0.06
## Cumulative Var 0.32 0.46 0.54 0.60 0.66 0.72
## Proportion Explained 0.45 0.20 0.10 0.09 0.08 0.08
## Cumulative Proportion 0.45 0.65 0.75 0.84 0.92 1.00
##
## With factor correlations of
## PA1 PA2 PA4 PA3 PA6 PA5
## PA1 1.00 0.09 -0.24 -0.09 -0.50 -0.21
## PA2 0.09 1.00 0.31 0.22 0.06 0.13
## PA4 -0.24 0.31 1.00 0.26 0.31 0.22
## PA3 -0.09 0.22 0.26 1.00 -0.04 -0.07
## PA6 -0.50 0.06 0.31 -0.04 1.00 0.04
## PA5 -0.21 0.13 0.22 -0.07 0.04 1.00
##
## Mean item complexity = 1.4
## Test of the hypothesis that 6 factors are sufficient.
##
## df null model = 210 with the objective function = 18.27 with Chi Square = 13594.25
## df of the model are 99 and the objective function was 1.66
##
## The root mean square of the residuals (RMSR) is 0.02
## The df corrected root mean square of the residuals is 0.04
##
## The harmonic n.obs is 753 with the empirical chi square 195.42 with prob < 2.7e-08
## The total n.obs was 753 with Likelihood Chi Square = 1227.08 with prob < 7.1e-194
##
## Tucker Lewis Index of factoring reliability = 0.82
## RMSEA index = 0.123 and the 90 % confidence intervals are 0.117 0.129
## BIC = 571.3
## Fit based upon off diagonal values = 1
## Measures of factor score adequacy
## PA1 PA2 PA4 PA3 PA6 PA5
## Correlation of (regression) scores with factors 0.99 0.98 0.93 0.95 0.91 0.92
## Multiple R square of scores with factors 0.97 0.96 0.87 0.90 0.83 0.85
## Minimum correlation of possible factor scores 0.94 0.92 0.73 0.80 0.66 0.70
##
## Coefficients and bootstrapped confidence intervals
## low PA1 upper low PA2 upper low PA4 upper low PA3
## sentlen.m -0.77 -0.73 -0.66 -0.04 -0.01 0.02 -0.04 -0.02 0.05 0.36 0.43
## sentcount 0.15 0.19 0.23 0.89 0.92 0.96 -0.15 -0.07 -0.01 -0.23 -0.18
## atl 0.57 0.67 0.75 -0.03 0.03 0.09 -0.27 -0.09 0.06 -0.26 -0.11
## activity 0.56 0.64 0.71 -0.07 -0.04 0.00 -0.04 0.03 0.09 0.16 0.23
## VERBfrac.m 0.70 0.79 0.85 -0.08 -0.04 0.00 -0.11 -0.03 0.05 0.03 0.10
## wordcount -0.15 -0.12 -0.09 0.90 0.93 0.96 -0.02 0.01 0.06 -0.02 0.02
## entropy 0.02 0.07 0.11 0.71 0.74 0.78 -0.01 0.04 0.10 -0.01 0.04
## compoundVERBs 0.79 0.91 0.99 -0.08 -0.04 0.01 -0.09 -0.02 0.07 -0.07 0.02
## passives -0.01 0.10 0.16 -0.06 -0.02 0.02 -0.06 0.01 0.14 -0.13 -0.02
## verbdist -0.91 -0.84 -0.74 -0.03 0.00 0.02 -0.15 -0.08 -0.02 -0.30 -0.21
## maentropy -0.13 -0.08 -0.04 -0.08 -0.05 -0.03 -0.02 0.01 0.06 -0.01 0.04
## predorder.m -0.90 -0.72 -0.54 -0.07 -0.03 0.01 -0.33 -0.13 0.06 0.03 0.21
## hapaxes 0.05 0.10 0.15 -0.83 -0.80 -0.76 -0.09 -0.03 0.02 -0.14 -0.08
## VERBcomp 0.42 0.53 0.59 -0.02 0.03 0.07 -0.19 -0.14 -0.05 0.43 0.50
## subj 0.63 0.73 0.80 0.06 0.12 0.17 -0.07 0.01 0.10 -0.07 0.00
## NOUNcount.m -1.00 -0.94 -0.82 0.01 0.05 0.09 -0.21 -0.12 -0.05 -0.18 -0.11
## NEGcount.m -0.12 -0.07 -0.01 -0.09 -0.06 -0.02 0.73 0.85 0.94 0.07 0.11
## NEGcount.v 0.12 0.16 0.21 0.03 0.07 0.11 0.70 0.81 0.93 -0.03 0.02
## mamr 0.74 0.82 0.88 -0.10 -0.05 0.01 -0.21 -0.09 0.03 -0.06 0.04
## obj -0.12 -0.05 0.03 -0.07 -0.02 0.04 0.01 0.10 0.23 0.67 0.75
## predorder.v -0.54 -0.45 -0.34 0.03 0.10 0.17 -0.13 -0.01 0.12 0.14 0.27
## upper low PA6 upper low PA5 upper
## sentlen.m 0.50 0.16 0.22 0.27 -0.02 0.02 0.06
## sentcount -0.13 -0.27 -0.20 -0.13 -0.09 -0.06 -0.02
## atl 0.04 -0.12 0.01 0.18 0.12 0.20 0.30
## activity 0.33 -0.56 -0.39 -0.27 -0.15 -0.10 -0.05
## VERBfrac.m 0.19 -0.35 -0.23 -0.14 -0.10 -0.06 -0.01
## wordcount 0.06 -0.03 0.02 0.06 -0.01 0.03 0.05
## entropy 0.08 0.02 0.07 0.12 0.38 0.43 0.49
## compoundVERBs 0.09 0.26 0.38 0.50 -0.05 0.02 0.08
## passives 0.03 0.72 0.81 0.90 -0.04 0.03 0.08
## verbdist -0.14 -0.01 0.11 0.26 -0.13 -0.09 -0.05
## maentropy 0.07 -0.01 0.05 0.09 0.77 0.87 0.94
## predorder.m 0.43 -0.10 0.06 0.24 -0.11 -0.04 0.03
## hapaxes -0.04 -0.13 -0.05 0.02 0.23 0.28 0.32
## VERBcomp 0.59 -0.29 -0.15 -0.07 0.00 0.07 0.11
## subj 0.05 0.07 0.19 0.33 -0.23 -0.14 -0.07
## NOUNcount.m -0.04 -0.12 -0.05 0.04 -0.03 0.02 0.08
## NEGcount.m 0.22 -0.05 0.00 0.12 -0.02 0.01 0.07
## NEGcount.v 0.13 -0.06 0.01 0.13 -0.04 0.00 0.06
## mamr 0.13 0.01 0.16 0.32 -0.28 -0.21 -0.14
## obj 0.87 -0.15 -0.03 0.05 -0.05 0.01 0.06
## predorder.v 0.44 -0.06 0.08 0.24 -0.05 0.02 0.11
##
## Interfactor correlations and bootstrapped confidence intervals
## lower estimate upper
## PA1-PA2 0.01699 0.094 0.1824
## PA1-PA4 -0.62387 -0.237 -0.0062
## PA1-PA3 -0.64202 -0.088 0.1442
## PA1-PA6 -0.63261 -0.496 0.1214
## PA1-PA5 -0.44579 -0.213 -0.0445
## PA2-PA4 -0.00023 0.306 0.4609
## PA2-PA3 -0.03279 0.221 0.4123
## PA2-PA6 -0.04253 0.056 0.3495
## PA2-PA5 -0.00632 0.132 0.2296
## PA4-PA3 0.06815 0.262 0.4383
## PA4-PA6 -0.03746 0.307 0.4471
## PA4-PA5 -0.04219 0.223 0.3715
## PA3-PA6 -0.25044 -0.036 0.3450
## PA3-PA5 -0.15745 -0.072 0.2782
## PA6-PA5 -0.18232 0.037 0.1791
# Largest absolute loading of each feature across all factors of fa_2,
# listed from the weakest to the strongest.
fa_2$loadings[] %>%
  abs() %>%
  as_tibble() %>%
  mutate(feat = colnames(data_engineered_1), .before = 1) %>%
  pivot_longer(cols = -feat) %>%
  group_by(feat) %>%
  summarize(maxload = max(value)) %>%
  arrange(maxload)
## # A tibble: 21 × 2
## feat maxload
## <chr> <dbl>
## 1 predorder.v 0.447
## 2 VERBcomp 0.525
## 3 activity 0.643
## 4 atl 0.666
## 5 predorder.m 0.722
## 6 sentlen.m 0.726
## 7 subj 0.730
## 8 entropy 0.742
## 9 obj 0.752
## 10 VERBfrac.m 0.788
## # ℹ 11 more rows
# Communalities of fa_2 in ascending order
sort(fa_2$communality)
## predorder.v atl subj passives predorder.m
## 0.3455333 0.4812761 0.5190796 0.5870862 0.5894059
## VERBcomp obj compoundVERBs NEGcount.v hapaxes
## 0.5980092 0.6150300 0.6245015 0.6842486 0.7049363
## mamr verbdist maentropy NOUNcount.m NEGcount.m
## 0.7247364 0.7820726 0.7849233 0.7957126 0.8035536
## entropy activity VERBfrac.m wordcount sentcount
## 0.8725579 0.8730513 0.8803732 0.8886349 0.9162827
## sentlen.m
## 0.9605998
# Names of the features whose communality in fa_2 is below 0.5
names(fa_2$communality[fa_2$communality < 0.5])
## [1] "atl" "predorder.v"
# Item complexities of fa_2 in ascending order
sort(fa_2$complexity)
## maentropy wordcount passives obj NEGcount.m
## 1.033820 1.037321 1.040238 1.044842 1.062761
## NOUNcount.m NEGcount.v verbdist VERBfrac.m mamr
## 1.072662 1.100447 1.209877 1.218738 1.236964
## predorder.m subj atl sentcount hapaxes
## 1.260679 1.269337 1.283860 1.284358 1.307339
## compoundVERBs entropy predorder.v sentlen.m activity
## 1.335633 1.669784 1.819032 1.840314 2.017190
## VERBcomp
## 2.339645
# Names of the features whose complexity in fa_2 exceeds 2
names(fa_2$complexity[fa_2$complexity > 2])
## [1] "activity" "VERBcomp"
# Drop the features whose communality in fa_2 fell below 0.5
# ("atl" and "predorder.v", as reported by the communality check).
# select() is namespace-qualified because MASS is attached after dplyr and
# masks it; the qualified call does not depend on the attach order.
data_engineered_2 <- data_engineered_1 %>%
  dplyr::select(!c(
    predorder.v,
    atl
  ))
# Determinant of the correlation matrix of the reduced feature set
det(cor(data_engineered_2))
## [1] 5.109255e-08
# Kaiser-Meyer-Olkin factor adequacy (overall and per item) after the removal
KMO(r = data_engineered_2)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = data_engineered_2)
## Overall MSA = 0.84
## MSA for each item =
## sentlen.m sentcount activity VERBfrac.m wordcount
## 0.84 0.71 0.90 0.90 0.70
## entropy compoundVERBs passives verbdist maentropy
## 0.73 0.91 0.80 0.92 0.62
## predorder.m hapaxes VERBcomp subj NOUNcount.m
## 0.89 0.79 0.88 0.94 0.92
## NEGcount.m NEGcount.v mamr obj
## 0.73 0.67 0.90 0.57
# Parallel analysis relies on randomly generated comparison data, so the RNG
# is seeded here (same seed as the fa() fits in this file) to keep the
# suggested number of factors reproducible when this step is run on its own.
set.seed(42)
fa.parallel(data_engineered_2, fm = "pa", fa = "fa", n.iter = 20)
## Parallel analysis suggests that the number of factors = 5 and the number of components = NA
# Five-factor principal-axis ("pa") solution with promax rotation on the
# reduced feature set. The seed is fixed because n.iter = 100 makes fa()
# report bootstrapped confidence intervals.
set.seed(42)
fa_3 <- fa(
  r = data_engineered_2,
  nfactors = 5,
  n.iter = 100,
  rotate = "promax",
  scores = "tenBerge",
  fm = "pa",
  oblique.scores = TRUE
)
print(fa_3)
## Factor Analysis with confidence intervals using method = fa(r = data_engineered_2, nfactors = 5, n.iter = 100, rotate = "promax",
## scores = "tenBerge", fm = "pa", oblique.scores = TRUE)
## Factor Analysis using method = pa
## Call: fa(r = data_engineered_2, nfactors = 5, n.iter = 100, rotate = "promax",
## scores = "tenBerge", fm = "pa", oblique.scores = TRUE)
## Standardized loadings (pattern matrix) based upon correlation matrix
## PA1 PA2 PA3 PA4 PA5 h2 u2 com
## sentlen.m -0.88 0.03 0.23 0.09 -0.12 0.90 0.097 1.2
## sentcount 0.25 0.90 -0.23 0.04 0.03 0.89 0.109 1.3
## activity 0.61 -0.05 -0.04 0.52 -0.02 0.88 0.115 2.0
## VERBfrac.m 0.76 -0.05 -0.08 0.32 -0.02 0.90 0.105 1.4
## wordcount -0.14 0.94 0.03 0.00 0.03 0.89 0.106 1.0
## entropy 0.02 0.75 0.17 0.02 0.41 0.85 0.151 1.7
## compoundVERBs 0.81 -0.03 0.13 -0.16 -0.10 0.56 0.437 1.2
## passives -0.04 0.00 0.27 -0.51 -0.13 0.35 0.649 1.7
## verbdist -0.75 0.00 -0.15 -0.33 -0.08 0.78 0.224 1.5
## maentropy -0.16 0.00 0.19 0.03 0.68 0.54 0.461 1.3
## predorder.m -0.78 -0.02 -0.04 0.04 -0.12 0.57 0.435 1.1
## hapaxes 0.13 -0.80 -0.03 0.00 0.31 0.73 0.268 1.4
## VERBcomp 0.30 0.05 0.03 0.58 -0.05 0.57 0.432 1.5
## subj 0.67 0.12 0.07 -0.08 -0.22 0.51 0.485 1.3
## NOUNcount.m -0.90 0.06 -0.20 -0.12 0.03 0.81 0.190 1.1
## NEGcount.m 0.03 -0.07 0.87 -0.05 0.16 0.70 0.295 1.1
## NEGcount.v 0.28 0.05 0.81 -0.10 0.15 0.61 0.390 1.3
## mamr 0.74 -0.05 -0.04 -0.01 -0.32 0.72 0.276 1.4
## obj -0.30 0.01 0.40 0.59 -0.14 0.60 0.403 2.5
##
## PA1 PA2 PA3 PA4 PA5
## SS loadings 5.92 2.92 1.81 1.71 1.02
## Proportion Var 0.31 0.15 0.10 0.09 0.05
## Cumulative Var 0.31 0.47 0.56 0.65 0.70
## Proportion Explained 0.44 0.22 0.14 0.13 0.08
## Cumulative Proportion 0.44 0.66 0.80 0.92 1.00
##
## With factor correlations of
## PA1 PA2 PA3 PA4 PA5
## PA1 1.00 0.10 -0.32 0.35 -0.13
## PA2 0.10 1.00 0.33 0.10 0.03
## PA3 -0.32 0.33 1.00 0.00 -0.09
## PA4 0.35 0.10 0.00 1.00 -0.17
## PA5 -0.13 0.03 -0.09 -0.17 1.00
##
## Mean item complexity = 1.4
## Test of the hypothesis that 5 factors are sufficient.
##
## df null model = 171 with the objective function = 16.79 with Chi Square = 12505.47
## df of the model are 86 and the objective function was 1.95
##
## The root mean square of the residuals (RMSR) is 0.03
## The df corrected root mean square of the residuals is 0.04
##
## The harmonic n.obs is 753 with the empirical chi square 259.11 with prob < 3e-19
## The total n.obs was 753 with Likelihood Chi Square = 1443.45 with prob < 3.1e-245
##
## Tucker Lewis Index of factoring reliability = 0.78
## RMSEA index = 0.145 and the 90 % confidence intervals are 0.138 0.151
## BIC = 873.78
## Fit based upon off diagonal values = 0.99
## Measures of factor score adequacy
## PA1 PA2 PA3 PA4 PA5
## Correlation of (regression) scores with factors 0.98 0.98 0.93 0.93 0.89
## Multiple R square of scores with factors 0.97 0.95 0.87 0.86 0.79
## Minimum correlation of possible factor scores 0.94 0.91 0.74 0.72 0.57
##
## Coefficients and bootstrapped confidence intervals
## low PA1 upper low PA2 upper low PA3 upper low PA4
## sentlen.m -0.97 -0.88 -0.77 -0.02 0.03 0.06 0.18 0.23 0.32 0.02 0.09
## sentcount 0.20 0.25 0.31 0.87 0.90 0.94 -0.30 -0.23 -0.17 0.00 0.04
## activity 0.53 0.61 0.69 -0.08 -0.05 -0.02 -0.09 -0.04 0.01 0.43 0.52
## VERBfrac.m 0.68 0.76 0.85 -0.08 -0.05 -0.02 -0.13 -0.08 -0.02 0.25 0.32
## wordcount -0.16 -0.14 -0.10 0.91 0.94 0.96 0.00 0.03 0.06 -0.03 0.00
## entropy -0.03 0.02 0.08 0.70 0.75 0.79 0.11 0.17 0.25 -0.05 0.02
## compoundVERBs 0.73 0.81 0.92 -0.09 -0.03 0.03 0.06 0.13 0.22 -0.27 -0.16
## passives -0.15 -0.04 0.07 -0.06 0.00 0.06 0.15 0.27 0.39 -0.64 -0.51
## verbdist -0.83 -0.75 -0.69 -0.02 0.00 0.03 -0.24 -0.15 -0.07 -0.41 -0.33
## maentropy -0.24 -0.16 -0.09 -0.04 0.00 0.04 0.14 0.19 0.28 -0.06 0.03
## predorder.m -0.89 -0.78 -0.70 -0.06 -0.02 0.03 -0.11 -0.04 0.06 -0.07 0.04
## hapaxes 0.09 0.13 0.16 -0.83 -0.80 -0.77 -0.07 -0.03 0.02 -0.05 0.00
## VERBcomp 0.22 0.30 0.41 -0.01 0.05 0.11 -0.04 0.03 0.11 0.43 0.58
## subj 0.60 0.67 0.74 0.07 0.12 0.17 0.01 0.07 0.13 -0.15 -0.08
## NOUNcount.m -0.98 -0.90 -0.82 0.02 0.06 0.09 -0.26 -0.20 -0.14 -0.17 -0.12
## NEGcount.m -0.08 0.03 0.10 -0.10 -0.07 -0.02 0.77 0.87 0.98 -0.12 -0.05
## NEGcount.v 0.17 0.28 0.36 0.01 0.05 0.10 0.73 0.81 0.91 -0.16 -0.10
## mamr 0.67 0.74 0.82 -0.09 -0.05 0.00 -0.10 -0.04 0.02 -0.08 -0.01
## obj -0.39 -0.30 -0.20 -0.03 0.01 0.05 0.33 0.40 0.50 0.48 0.59
## upper low PA5 upper
## sentlen.m 0.13 -0.19 -0.12 -0.04
## sentcount 0.08 -0.02 0.03 0.09
## activity 0.61 -0.07 -0.02 0.03
## VERBfrac.m 0.38 -0.08 -0.02 0.03
## wordcount 0.04 0.00 0.03 0.07
## entropy 0.06 0.35 0.41 0.51
## compoundVERBs -0.07 -0.19 -0.10 0.01
## passives -0.38 -0.25 -0.13 -0.02
## verbdist -0.25 -0.15 -0.08 -0.01
## maentropy 0.09 0.59 0.68 0.87
## predorder.m 0.10 -0.23 -0.12 0.00
## hapaxes 0.04 0.26 0.31 0.38
## VERBcomp 0.67 -0.13 -0.05 0.02
## subj 0.00 -0.31 -0.22 -0.13
## NOUNcount.m -0.06 -0.03 0.03 0.08
## NEGcount.m 0.04 0.06 0.16 0.25
## NEGcount.v 0.01 0.06 0.15 0.25
## mamr 0.05 -0.45 -0.32 -0.23
## obj 0.66 -0.22 -0.14 -0.05
##
## Interfactor correlations and bootstrapped confidence intervals
## lower estimate upper
## PA1-PA2 -0.039 0.0988 0.250
## PA1-PA3 -0.764 -0.3161 0.492
## PA1-PA4 -0.521 0.3484 0.647
## PA1-PA5 -0.359 -0.1312 0.053
## PA2-PA3 -0.010 0.3284 0.487
## PA2-PA4 -0.145 0.0957 0.428
## PA2-PA5 -0.079 0.0314 0.200
## PA3-PA4 -0.218 0.0017 0.124
## PA3-PA5 -0.361 -0.0882 0.245
## PA4-PA5 -0.350 -0.1696 0.375
# Largest absolute loading of each feature across all factors of fa_3,
# listed from the weakest to the strongest.
fa_3$loadings[] %>%
  abs() %>%
  as_tibble() %>%
  mutate(feat = colnames(data_engineered_2), .before = 1) %>%
  pivot_longer(cols = -feat) %>%
  group_by(feat) %>%
  summarize(maxload = max(value)) %>%
  arrange(maxload)
## # A tibble: 19 × 2
## feat maxload
## <chr> <dbl>
## 1 passives 0.507
## 2 VERBcomp 0.576
## 3 obj 0.587
## 4 activity 0.613
## 5 subj 0.673
## 6 maentropy 0.677
## 7 mamr 0.737
## 8 entropy 0.748
## 9 verbdist 0.750
## 10 VERBfrac.m 0.760
## 11 predorder.m 0.780
## 12 hapaxes 0.805
## 13 NEGcount.v 0.810
## 14 compoundVERBs 0.811
## 15 NEGcount.m 0.865
## 16 sentlen.m 0.877
## 17 NOUNcount.m 0.896
## 18 sentcount 0.900
## 19 wordcount 0.935
# Communalities of fa_3 (the h2 column of the printed solution), ascending
sort(fa_3$communality)
## passives subj maentropy compoundVERBs predorder.m
## 0.3509693 0.5146531 0.5394025 0.5631851 0.5651185
## VERBcomp obj NEGcount.v NEGcount.m mamr
## 0.5675835 0.5970745 0.6096614 0.7045599 0.7237484
## hapaxes verbdist NOUNcount.m entropy activity
## 0.7318879 0.7763733 0.8096764 0.8489362 0.8848291
## sentcount wordcount VERBfrac.m sentlen.m
## 0.8907903 0.8936181 0.8952361 0.9031961
# Names of the features whose communality in fa_3 is below 0.5
names(fa_3$communality[fa_3$communality < 0.5])
## [1] "passives"
# Item complexities of fa_3 (the com column of the printed solution), ascending
sort(fa_3$complexity)
## wordcount predorder.m NEGcount.m NOUNcount.m compoundVERBs
## 1.045774 1.061340 1.086638 1.144807 1.166381
## sentlen.m maentropy sentcount subj NEGcount.v
## 1.206245 1.294684 1.302652 1.326581 1.347438
## hapaxes VERBfrac.m mamr verbdist VERBcomp
## 1.351258 1.382135 1.387440 1.495842 1.549356
## entropy passives activity obj
## 1.684038 1.689338 1.974918 2.469330
# Names of the features whose complexity in fa_3 exceeds 2
names(fa_3$complexity[fa_3$complexity > 2])
## [1] "obj"
# Drop the feature whose communality in fa_3 fell below 0.5 ("passives").
# select() is namespace-qualified because MASS is attached after dplyr and
# masks it; the qualified call does not depend on the attach order.
data_engineered_3 <- data_engineered_2 %>%
  dplyr::select(!c(
    passives
  ))
# Determinant of the correlation matrix of the reduced feature set
det(cor(data_engineered_3))
## [1] 9.330367e-08
# Kaiser-Meyer-Olkin factor adequacy (overall and per item) after the removal
KMO(r = data_engineered_3)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = data_engineered_3)
## Overall MSA = 0.84
## MSA for each item =
## sentlen.m sentcount activity VERBfrac.m wordcount
## 0.83 0.70 0.90 0.89 0.70
## entropy compoundVERBs verbdist maentropy predorder.m
## 0.72 0.92 0.91 0.61 0.89
## hapaxes VERBcomp subj NOUNcount.m NEGcount.m
## 0.79 0.87 0.94 0.92 0.72
## NEGcount.v mamr obj
## 0.66 0.89 0.56
# Parallel analysis relies on randomly generated comparison data, so the RNG
# is seeded here (same seed as the fa() fits in this file) to keep the
# suggested number of factors reproducible when this step is run on its own.
set.seed(42)
fa.parallel(data_engineered_3, fm = "pa", fa = "fa", n.iter = 20)
## Parallel analysis suggests that the number of factors = 5 and the number of components = NA
# Five-factor principal-axis ("pa") solution with promax rotation after
# dropping "passives". The seed is fixed because n.iter = 100 makes fa()
# report bootstrapped confidence intervals.
set.seed(42)
fa_4 <- fa(
  r = data_engineered_3,
  nfactors = 5,
  n.iter = 100,
  rotate = "promax",
  scores = "tenBerge",
  fm = "pa",
  oblique.scores = TRUE
)
print(fa_4)
## Factor Analysis with confidence intervals using method = fa(r = data_engineered_3, nfactors = 5, n.iter = 100, rotate = "promax",
## scores = "tenBerge", fm = "pa", oblique.scores = TRUE)
## Factor Analysis using method = pa
## Call: fa(r = data_engineered_3, nfactors = 5, n.iter = 100, rotate = "promax",
## scores = "tenBerge", fm = "pa", oblique.scores = TRUE)
## Standardized loadings (pattern matrix) based upon correlation matrix
## PA1 PA2 PA3 PA5 PA4 h2 u2 com
## sentlen.m -0.86 0.04 0.10 0.37 -0.06 0.90 0.096 1.4
## sentcount 0.28 0.88 -0.15 -0.12 0.01 0.87 0.129 1.3
## activity 0.84 -0.06 -0.09 0.26 0.01 0.81 0.188 1.2
## VERBfrac.m 0.91 -0.06 -0.10 0.10 0.00 0.88 0.121 1.1
## wordcount -0.15 0.94 0.01 0.04 0.03 0.89 0.108 1.1
## entropy 0.03 0.76 0.07 -0.01 0.43 0.87 0.133 1.6
## compoundVERBs 0.71 -0.02 0.10 -0.13 -0.05 0.51 0.492 1.1
## verbdist -0.89 0.00 -0.06 -0.19 -0.11 0.78 0.222 1.1
## maentropy -0.10 -0.02 0.04 -0.03 0.79 0.69 0.314 1.0
## predorder.m -0.76 -0.02 -0.07 0.19 -0.10 0.56 0.437 1.2
## hapaxes 0.16 -0.81 -0.06 -0.11 0.31 0.73 0.270 1.4
## VERBcomp 0.56 0.04 -0.15 0.49 0.06 0.60 0.396 2.2
## subj 0.61 0.12 0.08 -0.06 -0.19 0.48 0.517 1.3
## NOUNcount.m -0.93 0.05 -0.14 -0.04 0.00 0.81 0.194 1.1
## NEGcount.m -0.05 -0.07 0.83 0.14 0.04 0.76 0.239 1.1
## NEGcount.v 0.20 0.05 0.84 0.03 0.02 0.71 0.293 1.1
## mamr 0.72 -0.04 -0.01 -0.03 -0.27 0.70 0.305 1.3
## obj -0.07 0.01 0.14 0.78 -0.04 0.67 0.330 1.1
##
## PA1 PA2 PA3 PA5 PA4
## SS loadings 6.41 2.91 1.57 1.24 1.09
## Proportion Var 0.36 0.16 0.09 0.07 0.06
## Cumulative Var 0.36 0.52 0.60 0.67 0.73
## Proportion Explained 0.48 0.22 0.12 0.09 0.08
## Cumulative Proportion 0.48 0.70 0.82 0.92 1.00
##
## With factor correlations of
## PA1 PA2 PA3 PA5 PA4
## PA1 1.00 0.13 -0.26 0.01 -0.26
## PA2 0.13 1.00 0.30 0.15 0.10
## PA3 -0.26 0.30 1.00 0.16 0.22
## PA5 0.01 0.15 0.16 1.00 0.02
## PA4 -0.26 0.10 0.22 0.02 1.00
##
## Mean item complexity = 1.3
## Test of the hypothesis that 5 factors are sufficient.
##
## df null model = 153 with the objective function = 16.19 with Chi Square = 12062.32
## df of the model are 73 and the objective function was 1.62
##
## The root mean square of the residuals (RMSR) is 0.02
## The df corrected root mean square of the residuals is 0.04
##
## The harmonic n.obs is 753 with the empirical chi square 137.03 with prob < 8.5e-06
## The total n.obs was 753 with Likelihood Chi Square = 1205.36 with prob < 1.5e-204
##
## Tucker Lewis Index of factoring reliability = 0.8
## RMSEA index = 0.144 and the 90 % confidence intervals are 0.137 0.151
## BIC = 721.81
## Fit based upon off diagonal values = 1
## Measures of factor score adequacy
## PA1 PA2 PA3 PA5 PA4
## Correlation of (regression) scores with factors 0.99 0.98 0.93 0.92 0.90
## Multiple R square of scores with factors 0.97 0.95 0.86 0.84 0.81
## Minimum correlation of possible factor scores 0.94 0.91 0.72 0.68 0.62
##
## Coefficients and bootstrapped confidence intervals
## low PA1 upper low PA2 upper low PA3 upper low PA5
## sentlen.m -0.89 -0.86 -0.81 0.00 0.04 0.07 0.05 0.10 0.15 0.32 0.37
## sentcount 0.25 0.28 0.31 0.84 0.88 0.92 -0.19 -0.15 -0.11 -0.15 -0.12
## activity 0.82 0.84 0.87 -0.09 -0.06 -0.02 -0.14 -0.09 -0.04 0.22 0.26
## VERBfrac.m 0.88 0.91 0.94 -0.09 -0.06 -0.02 -0.14 -0.10 -0.04 0.05 0.10
## wordcount -0.18 -0.15 -0.12 0.91 0.94 0.97 -0.02 0.01 0.04 0.01 0.04
## entropy -0.01 0.03 0.07 0.72 0.76 0.79 0.03 0.07 0.11 -0.05 -0.01
## compoundVERBs 0.66 0.71 0.77 -0.07 -0.02 0.04 0.02 0.10 0.17 -0.20 -0.13
## verbdist -0.99 -0.89 -0.81 -0.03 0.00 0.03 -0.12 -0.06 0.01 -0.23 -0.19
## maentropy -0.14 -0.10 -0.07 -0.04 -0.02 0.01 0.00 0.04 0.09 -0.07 -0.03
## predorder.m -0.87 -0.76 -0.68 -0.06 -0.02 0.03 -0.16 -0.07 0.06 0.07 0.19
## hapaxes 0.12 0.16 0.20 -0.84 -0.81 -0.78 -0.10 -0.06 -0.01 -0.15 -0.11
## VERBcomp 0.52 0.56 0.62 0.00 0.04 0.09 -0.21 -0.15 -0.09 0.40 0.49
## subj 0.55 0.61 0.68 0.06 0.12 0.17 0.02 0.08 0.14 -0.12 -0.06
## NOUNcount.m -0.96 -0.93 -0.90 0.02 0.05 0.09 -0.20 -0.14 -0.09 -0.09 -0.04
## NEGcount.m -0.08 -0.05 -0.01 -0.10 -0.07 -0.03 0.76 0.83 0.91 0.10 0.14
## NEGcount.v 0.16 0.20 0.23 0.02 0.05 0.09 0.77 0.84 0.91 0.00 0.03
## mamr 0.66 0.72 0.77 -0.09 -0.04 0.01 -0.09 -0.01 0.06 -0.11 -0.03
## obj -0.11 -0.07 -0.03 -0.02 0.01 0.05 0.09 0.14 0.20 0.71 0.78
## upper low PA4 upper
## sentlen.m 0.43 -0.10 -0.06 -0.02
## sentcount -0.08 -0.04 0.01 0.05
## activity 0.31 -0.05 0.01 0.05
## VERBfrac.m 0.15 -0.05 0.00 0.05
## wordcount 0.07 0.00 0.03 0.07
## entropy 0.02 0.38 0.43 0.49
## compoundVERBs -0.06 -0.11 -0.05 0.02
## verbdist -0.14 -0.16 -0.11 -0.06
## maentropy 0.01 0.71 0.79 0.87
## predorder.m 0.29 -0.17 -0.10 -0.02
## hapaxes -0.07 0.26 0.31 0.36
## VERBcomp 0.59 0.01 0.06 0.11
## subj 0.00 -0.28 -0.19 -0.09
## NOUNcount.m 0.01 -0.04 0.00 0.05
## NEGcount.m 0.19 -0.01 0.04 0.09
## NEGcount.v 0.08 -0.02 0.02 0.06
## mamr 0.04 -0.35 -0.27 -0.20
## obj 0.86 -0.09 -0.04 0.01
##
## Interfactor correlations and bootstrapped confidence intervals
## lower estimate upper
## PA1-PA2 0.058 0.1305 0.21
## PA1-PA3 -0.346 -0.2550 -0.14
## PA1-PA5 -0.256 0.0081 0.19
## PA1-PA4 -0.402 -0.2633 -0.10
## PA2-PA3 0.229 0.3041 0.37
## PA2-PA5 0.040 0.1474 0.23
## PA2-PA4 -0.013 0.1017 0.22
## PA3-PA5 0.018 0.1603 0.30
## PA3-PA4 0.056 0.2182 0.37
## PA5-PA4 -0.119 0.0248 0.18
# Largest absolute loading of each feature across all factors of fa_4,
# listed from the weakest to the strongest.
fa_4$loadings[] %>%
  abs() %>%
  as_tibble() %>%
  mutate(feat = colnames(data_engineered_3), .before = 1) %>%
  pivot_longer(cols = -feat) %>%
  group_by(feat) %>%
  summarize(maxload = max(value)) %>%
  arrange(maxload)
## # A tibble: 18 × 2
## feat maxload
## <chr> <dbl>
## 1 VERBcomp 0.564
## 2 subj 0.615
## 3 compoundVERBs 0.712
## 4 mamr 0.716
## 5 entropy 0.755
## 6 predorder.m 0.761
## 7 obj 0.778
## 8 maentropy 0.788
## 9 hapaxes 0.808
## 10 NEGcount.m 0.834
## 11 NEGcount.v 0.839
## 12 activity 0.844
## 13 sentlen.m 0.856
## 14 sentcount 0.881
## 15 verbdist 0.892
## 16 VERBfrac.m 0.909
## 17 NOUNcount.m 0.929
## 18 wordcount 0.935
# Communalities of fa_4 (the h2 column of the printed solution), ascending
sort(fa_4$communality)
## subj compoundVERBs predorder.m VERBcomp obj
## 0.4833988 0.5080629 0.5628197 0.6043396 0.6698230
## maentropy mamr NEGcount.v hapaxes NEGcount.m
## 0.6860621 0.6950548 0.7071884 0.7297487 0.7608026
## verbdist NOUNcount.m activity entropy sentcount
## 0.7779659 0.8063441 0.8124988 0.8669604 0.8710431
## VERBfrac.m wordcount sentlen.m
## 0.8792764 0.8922747 0.9039542
# Names of the features whose communality in fa_4 is below 0.5
names(fa_4$communality[fa_4$communality < 0.5])
## [1] "subj"
# Item complexities of fa_4 (the com column of the printed solution), ascending
sort(fa_4$complexity)
## maentropy wordcount VERBfrac.m NOUNcount.m NEGcount.m
## 1.040417 1.056721 1.057438 1.057838 1.080802
## obj compoundVERBs NEGcount.v verbdist predorder.m
## 1.089701 1.110837 1.120287 1.126815 1.171046
## activity mamr sentcount subj sentlen.m
## 1.227151 1.297926 1.299314 1.324019 1.415335
## hapaxes entropy VERBcomp
## 1.427792 1.609326 2.170189
# Names of the features whose complexity in fa_4 exceeds 2
names(fa_4$complexity[fa_4$complexity > 2])
## [1] "VERBcomp"
# Drop the feature whose communality in fa_4 fell below 0.5 ("subj").
# select() is namespace-qualified because MASS is attached after dplyr and
# masks it; the qualified call does not depend on the attach order.
data_engineered_4 <- data_engineered_3 %>%
  dplyr::select(!c(
    subj
  ))
# Determinant of the correlation matrix of the reduced feature set
det(cor(data_engineered_4))
## [1] 1.925217e-07
# Kaiser-Meyer-Olkin factor adequacy (overall and per item) after the removal
KMO(r = data_engineered_4)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = data_engineered_4)
## Overall MSA = 0.82
## MSA for each item =
## sentlen.m sentcount activity VERBfrac.m wordcount
## 0.82 0.69 0.89 0.88 0.70
## entropy compoundVERBs verbdist maentropy predorder.m
## 0.72 0.91 0.91 0.59 0.88
## hapaxes VERBcomp NOUNcount.m NEGcount.m NEGcount.v
## 0.79 0.86 0.91 0.72 0.66
## mamr obj
## 0.88 0.57
# Parallel analysis relies on randomly generated comparison data, so the RNG
# is seeded here (same seed as the fa() fits in this file) to keep the
# suggested number of factors reproducible when this step is run on its own.
set.seed(42)
fa.parallel(data_engineered_4, fm = "pa", fa = "fa", n.iter = 20)
## Parallel analysis suggests that the number of factors = 5 and the number of components = NA
# Five-factor principal-axis ("pa") solution with promax rotation after
# dropping "subj". The seed is fixed because n.iter = 100 makes fa()
# report bootstrapped confidence intervals.
set.seed(42)
fa_5 <- fa(
  r = data_engineered_4,
  nfactors = 5,
  n.iter = 100,
  rotate = "promax",
  scores = "tenBerge",
  fm = "pa",
  oblique.scores = TRUE
)
print(fa_5)
## Factor Analysis with confidence intervals using method = fa(r = data_engineered_4, nfactors = 5, n.iter = 100, rotate = "promax",
## scores = "tenBerge", fm = "pa", oblique.scores = TRUE)
## Factor Analysis using method = pa
## Call: fa(r = data_engineered_4, nfactors = 5, n.iter = 100, rotate = "promax",
## scores = "tenBerge", fm = "pa", oblique.scores = TRUE)
## Standardized loadings (pattern matrix) based upon correlation matrix
## PA1 PA2 PA5 PA3 PA4 h2 u2 com
## sentlen.m -0.83 0.02 0.08 0.39 -0.03 0.90 0.096 1.4
## sentcount 0.27 0.89 -0.13 -0.13 -0.02 0.88 0.118 1.3
## activity 0.86 -0.04 -0.08 0.24 -0.02 0.82 0.183 1.2
## VERBfrac.m 0.92 -0.04 -0.08 0.08 -0.02 0.89 0.112 1.0
## wordcount -0.15 0.94 0.02 0.03 0.02 0.90 0.100 1.1
## entropy 0.02 0.74 0.07 -0.01 0.39 0.84 0.159 1.5
## compoundVERBs 0.70 0.00 0.10 -0.14 -0.05 0.50 0.501 1.1
## verbdist -0.90 -0.01 -0.06 -0.17 -0.09 0.78 0.222 1.1
## maentropy -0.08 -0.05 0.00 0.00 0.90 0.84 0.164 1.0
## predorder.m -0.74 -0.03 -0.08 0.20 -0.07 0.56 0.440 1.2
## hapaxes 0.15 -0.80 -0.06 -0.10 0.30 0.70 0.298 1.4
## VERBcomp 0.59 0.05 -0.15 0.47 0.06 0.60 0.403 2.1
## NOUNcount.m -0.92 0.03 -0.15 -0.03 0.01 0.80 0.203 1.1
## NEGcount.m -0.06 -0.06 0.82 0.14 0.02 0.75 0.255 1.1
## NEGcount.v 0.19 0.06 0.87 0.02 -0.02 0.74 0.261 1.1
## mamr 0.71 -0.03 -0.02 -0.04 -0.24 0.66 0.344 1.2
## obj -0.04 0.00 0.12 0.80 -0.01 0.70 0.299 1.0
##
## PA1 PA2 PA5 PA3 PA4
## SS loadings 5.98 2.89 1.56 1.25 1.16
## Proportion Var 0.35 0.17 0.09 0.07 0.07
## Cumulative Var 0.35 0.52 0.61 0.69 0.76
## Proportion Explained 0.47 0.23 0.12 0.10 0.09
## Cumulative Proportion 0.47 0.69 0.81 0.91 1.00
##
## With factor correlations of
## PA1 PA2 PA5 PA3 PA4
## PA1 1.00 0.11 -0.26 -0.03 -0.25
## PA2 0.11 1.00 0.30 0.16 0.15
## PA5 -0.26 0.30 1.00 0.19 0.27
## PA3 -0.03 0.16 0.19 1.00 0.00
## PA4 -0.25 0.15 0.27 0.00 1.00
##
## Mean item complexity = 1.2
## Test of the hypothesis that 5 factors are sufficient.
##
## df null model = 136 with the objective function = 15.46 with Chi Square = 11527.71
## df of the model are 61 and the objective function was 1.4
##
## The root mean square of the residuals (RMSR) is 0.02
## The df corrected root mean square of the residuals is 0.03
##
## The harmonic n.obs is 753 with the empirical chi square 104 with prob < 5e-04
## The total n.obs was 753 with Likelihood Chi Square = 1035.68 with prob < 3.3e-177
##
## Tucker Lewis Index of factoring reliability = 0.808
## RMSEA index = 0.146 and the 90 % confidence intervals are 0.138 0.154
## BIC = 631.61
## Fit based upon off diagonal values = 1
## Measures of factor score adequacy
## PA1 PA2 PA5 PA3 PA4
## Correlation of (regression) scores with factors 0.99 0.98 0.93 0.92 0.93
## Multiple R square of scores with factors 0.97 0.95 0.86 0.85 0.87
## Minimum correlation of possible factor scores 0.94 0.91 0.73 0.69 0.73
##
## Coefficients and bootstrapped confidence intervals
## low PA1 upper low PA2 upper low PA5 upper low PA3
## sentlen.m -0.87 -0.83 -0.79 -0.01 0.02 0.05 0.04 0.08 0.13 0.35 0.39
## sentcount 0.24 0.27 0.31 0.86 0.89 0.93 -0.18 -0.13 -0.09 -0.17 -0.13
## activity 0.83 0.86 0.88 -0.07 -0.04 -0.01 -0.12 -0.08 -0.03 0.19 0.24
## VERBfrac.m 0.89 0.92 0.94 -0.07 -0.04 -0.01 -0.14 -0.08 -0.03 0.04 0.08
## wordcount -0.17 -0.15 -0.12 0.91 0.94 0.96 -0.01 0.02 0.05 0.00 0.03
## entropy -0.02 0.02 0.06 0.71 0.74 0.77 0.03 0.07 0.10 -0.03 -0.01
## compoundVERBs 0.64 0.70 0.76 -0.06 0.00 0.06 0.02 0.10 0.17 -0.19 -0.14
## verbdist -0.99 -0.90 -0.81 -0.04 -0.01 0.02 -0.13 -0.06 0.01 -0.22 -0.17
## maentropy -0.12 -0.08 -0.06 -0.07 -0.05 -0.03 -0.04 0.00 0.04 -0.03 0.00
## predorder.m -0.85 -0.74 -0.66 -0.08 -0.03 0.01 -0.20 -0.08 0.03 0.11 0.20
## hapaxes 0.11 0.15 0.19 -0.83 -0.80 -0.78 -0.11 -0.06 -0.02 -0.14 -0.10
## VERBcomp 0.54 0.59 0.64 0.00 0.05 0.09 -0.19 -0.15 -0.09 0.40 0.47
## NOUNcount.m -0.94 -0.92 -0.89 0.00 0.03 0.07 -0.19 -0.15 -0.10 -0.08 -0.03
## NEGcount.m -0.09 -0.06 -0.02 -0.10 -0.06 -0.02 0.76 0.82 0.89 0.10 0.14
## NEGcount.v 0.15 0.19 0.21 0.03 0.06 0.10 0.78 0.87 0.94 -0.02 0.02
## mamr 0.65 0.71 0.76 -0.08 -0.03 0.02 -0.11 -0.02 0.06 -0.11 -0.04
## obj -0.08 -0.04 0.01 -0.03 0.00 0.04 0.07 0.12 0.18 0.74 0.80
## upper low PA4 upper
## sentlen.m 0.45 -0.06 -0.03 0.01
## sentcount -0.10 -0.06 -0.02 0.02
## activity 0.28 -0.06 -0.02 0.02
## VERBfrac.m 0.12 -0.06 -0.02 0.02
## wordcount 0.06 -0.01 0.02 0.05
## entropy 0.03 0.33 0.39 0.46
## compoundVERBs -0.06 -0.12 -0.05 0.02
## verbdist -0.12 -0.13 -0.09 -0.04
## maentropy 0.03 0.81 0.90 0.99
## predorder.m 0.31 -0.13 -0.07 -0.01
## hapaxes -0.06 0.25 0.30 0.35
## VERBcomp 0.56 0.01 0.06 0.10
## NOUNcount.m 0.02 -0.03 0.01 0.06
## NEGcount.m 0.18 -0.02 0.02 0.06
## NEGcount.v 0.07 -0.06 -0.02 0.02
## mamr 0.04 -0.32 -0.24 -0.18
## obj 0.87 -0.05 -0.01 0.03
##
## Interfactor correlations and bootstrapped confidence intervals
## lower estimate upper
## PA1-PA2 0.040 0.1101 0.186
## PA1-PA5 -0.331 -0.2553 -0.161
## PA1-PA3 -0.279 -0.0277 0.127
## PA1-PA4 -0.371 -0.2456 -0.045
## PA2-PA5 0.233 0.2995 0.362
## PA2-PA3 0.059 0.1623 0.238
## PA2-PA4 0.066 0.1483 0.245
## PA5-PA3 0.040 0.1893 0.322
## PA5-PA4 0.162 0.2651 0.383
## PA3-PA4 -0.125 0.0033 0.118
# Largest absolute loading of each feature across all factors of fa_5,
# listed from the weakest to the strongest.
fa_5$loadings[] %>%
  abs() %>%
  as_tibble() %>%
  mutate(feat = colnames(data_engineered_4), .before = 1) %>%
  pivot_longer(cols = -feat) %>%
  group_by(feat) %>%
  summarize(maxload = max(value)) %>%
  arrange(maxload)
## # A tibble: 17 × 2
## feat maxload
## <chr> <dbl>
## 1 VERBcomp 0.593
## 2 compoundVERBs 0.698
## 3 mamr 0.706
## 4 predorder.m 0.743
## 5 entropy 0.745
## 6 hapaxes 0.802
## 7 obj 0.804
## 8 NEGcount.m 0.820
## 9 sentlen.m 0.833
## 10 activity 0.859
## 11 NEGcount.v 0.865
## 12 sentcount 0.895
## 13 verbdist 0.895
## 14 maentropy 0.896
## 15 VERBfrac.m 0.916
## 16 NOUNcount.m 0.920
## 17 wordcount 0.938
# Communalities of fa_5 (the h2 column of the printed solution), ascending
sort(fa_5$communality)
## compoundVERBs predorder.m VERBcomp mamr obj
## 0.4987070 0.5604406 0.5969918 0.6556049 0.7014127
## hapaxes NEGcount.v NEGcount.m verbdist NOUNcount.m
## 0.7019362 0.7386412 0.7453366 0.7777822 0.7965697
## activity maentropy entropy sentcount VERBfrac.m
## 0.8166937 0.8355523 0.8410866 0.8823881 0.8884130
## wordcount sentlen.m
## 0.8995476 0.9036277
# Names of the features whose communality in fa_5 is below 0.5
names(fa_5$communality[fa_5$communality < 0.5])
## [1] "compoundVERBs"
# Item complexities of fa_5 (the com column of the printed solution), ascending
sort(fa_5$complexity)
## maentropy VERBfrac.m obj wordcount NOUNcount.m
## 1.025148 1.034315 1.047293 1.052468 1.054850
## NEGcount.m NEGcount.v verbdist compoundVERBs activity
## 1.078114 1.104370 1.104622 1.129211 1.175196
## predorder.m mamr sentcount hapaxes sentlen.m
## 1.188548 1.248753 1.285921 1.395322 1.445528
## entropy VERBcomp
## 1.537031 2.084382
# Names of the features whose complexity in fa_5 exceeds 2
names(fa_5$complexity[fa_5$complexity > 2])
## [1] "VERBcomp"
# Drop the feature whose communality in fa_5 fell below 0.5 ("compoundVERBs").
# select() is namespace-qualified because MASS is attached after dplyr and
# masks it; the qualified call does not depend on the attach order.
data_engineered_5 <- data_engineered_4 %>%
  dplyr::select(!c(
    compoundVERBs
  ))
# Determinant of the correlation matrix of the reduced feature set
det(cor(data_engineered_5))
## [1] 4.385204e-07
# Kaiser-Meyer-Olkin factor adequacy (overall and per item) after the removal
KMO(r = data_engineered_5)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = data_engineered_5)
## Overall MSA = 0.81
## MSA for each item =
## sentlen.m sentcount activity VERBfrac.m wordcount entropy
## 0.81 0.69 0.88 0.87 0.70 0.73
## verbdist maentropy predorder.m hapaxes VERBcomp NOUNcount.m
## 0.90 0.57 0.87 0.79 0.85 0.90
## NEGcount.m NEGcount.v mamr obj
## 0.71 0.66 0.88 0.61
# Parallel analysis relies on randomly generated comparison data, so the RNG
# is seeded here (same seed as the fa() fits in this file) to keep the
# suggested number of factors reproducible when this step is run on its own.
set.seed(42)
fa.parallel(data_engineered_5, fm = "pa", fa = "fa", n.iter = 20)
## Parallel analysis suggests that the number of factors = 5 and the number of components = NA
# Remember the column names of the final feature set.
final_collist <- colnames(data_engineered_5)

# Final five-factor principal-axis ("pa") solution with promax rotation.
# The seed is fixed because n.iter = 100 makes fa() report bootstrapped
# confidence intervals.
set.seed(42)
fa_res <- fa(
  r = data_engineered_5,
  nfactors = 5,
  n.iter = 100,
  rotate = "promax",
  scores = "tenBerge",
  fm = "pa",
  oblique.scores = TRUE
)
print(fa_res)
## Factor Analysis with confidence intervals using method = fa(r = data_engineered_5, nfactors = 5, n.iter = 100, rotate = "promax",
## scores = "tenBerge", fm = "pa", oblique.scores = TRUE)
## Factor Analysis using method = pa
## Call: fa(r = data_engineered_5, nfactors = 5, n.iter = 100, rotate = "promax",
## scores = "tenBerge", fm = "pa", oblique.scores = TRUE)
## Standardized loadings (pattern matrix) based upon correlation matrix
## PA1 PA2 PA5 PA3 PA4 h2 u2 com
## sentlen.m -0.82 0.02 0.04 0.46 -0.02 0.95 0.047 1.6
## sentcount 0.27 0.90 -0.12 -0.16 -0.03 0.89 0.108 1.3
## activity 0.89 -0.05 -0.06 0.20 -0.02 0.83 0.174 1.1
## VERBfrac.m 0.92 -0.04 -0.06 0.04 -0.03 0.89 0.114 1.0
## wordcount -0.14 0.94 0.01 0.04 0.02 0.90 0.100 1.1
## entropy 0.02 0.75 0.06 0.00 0.38 0.83 0.167 1.5
## verbdist -0.91 -0.01 -0.08 -0.13 -0.08 0.78 0.217 1.1
## maentropy -0.08 -0.05 -0.02 0.00 0.93 0.88 0.120 1.0
## predorder.m -0.72 -0.03 -0.10 0.23 -0.05 0.56 0.437 1.3
## hapaxes 0.14 -0.80 -0.06 -0.11 0.29 0.70 0.303 1.4
## VERBcomp 0.64 0.05 -0.15 0.47 0.06 0.61 0.392 2.0
## NOUNcount.m -0.90 0.03 -0.15 0.00 0.02 0.78 0.224 1.1
## NEGcount.m -0.07 -0.06 0.82 0.14 0.01 0.75 0.246 1.1
## NEGcount.v 0.16 0.06 0.86 0.02 -0.03 0.73 0.267 1.1
## mamr 0.69 -0.03 -0.02 -0.05 -0.24 0.63 0.369 1.3
## obj 0.03 0.00 0.11 0.77 0.00 0.64 0.356 1.0
##
## PA1 PA2 PA5 PA3 PA4
## SS loadings 5.51 2.90 1.53 1.24 1.19
## Proportion Var 0.34 0.18 0.10 0.08 0.07
## Cumulative Var 0.34 0.53 0.62 0.70 0.77
## Proportion Explained 0.45 0.23 0.12 0.10 0.10
## Cumulative Proportion 0.45 0.68 0.80 0.90 1.00
##
## With factor correlations of
## PA1 PA2 PA5 PA3 PA4
## PA1 1.00 0.11 -0.24 -0.08 -0.25
## PA2 0.11 1.00 0.31 0.16 0.15
## PA5 -0.24 0.31 1.00 0.22 0.28
## PA3 -0.08 0.16 0.22 1.00 0.02
## PA4 -0.25 0.15 0.28 0.02 1.00
##
## Mean item complexity = 1.2
## Test of the hypothesis that 5 factors are sufficient.
##
## df null model = 120 with the objective function = 14.64 with Chi Square = 10918.9
## df of the model are 50 and the objective function was 1.12
##
## The root mean square of the residuals (RMSR) is 0.02
## The df corrected root mean square of the residuals is 0.03
##
## The harmonic n.obs is 753 with the empirical chi square 69.14 with prob < 0.038
## The total n.obs was 753 with Likelihood Chi Square = 834.38 with prob < 8.6e-143
##
## Tucker Lewis Index of factoring reliability = 0.825
## RMSEA index = 0.144 and the 90 % confidence intervals are 0.136 0.153
## BIC = 503.18
## Fit based upon off diagonal values = 1
## Measures of factor score adequacy
## PA1 PA2 PA5 PA3 PA4
## Correlation of (regression) scores with factors 0.99 0.98 0.93 0.94 0.95
## Multiple R square of scores with factors 0.97 0.96 0.86 0.88 0.90
## Minimum correlation of possible factor scores 0.95 0.91 0.72 0.76 0.79
##
## Coefficients and bootstrapped confidence intervals
## low PA1 upper low PA2 upper low PA5 upper low PA3
## sentlen.m -0.86 -0.82 -0.77 -0.01 0.02 0.05 -0.01 0.04 0.08 0.42 0.46
## sentcount 0.24 0.27 0.31 0.86 0.90 0.93 -0.16 -0.12 -0.07 -0.20 -0.16
## activity 0.86 0.89 0.91 -0.08 -0.05 -0.01 -0.10 -0.06 -0.02 0.16 0.20
## VERBfrac.m 0.89 0.92 0.95 -0.08 -0.04 -0.01 -0.11 -0.06 -0.02 0.00 0.04
## wordcount -0.17 -0.14 -0.12 0.92 0.94 0.96 -0.02 0.01 0.05 0.02 0.04
## entropy -0.02 0.02 0.05 0.72 0.75 0.78 0.02 0.06 0.10 -0.03 0.00
## verbdist -1.00 -0.91 -0.84 -0.04 -0.01 0.01 -0.13 -0.08 -0.03 -0.19 -0.13
## maentropy -0.10 -0.08 -0.05 -0.07 -0.05 -0.03 -0.05 -0.02 0.02 -0.03 0.00
## predorder.m -0.83 -0.72 -0.65 -0.07 -0.03 0.01 -0.20 -0.10 0.01 0.12 0.23
## hapaxes 0.10 0.14 0.18 -0.83 -0.80 -0.77 -0.11 -0.06 -0.01 -0.15 -0.11
## VERBcomp 0.60 0.64 0.67 0.00 0.05 0.10 -0.21 -0.15 -0.10 0.39 0.47
## NOUNcount.m -0.94 -0.90 -0.87 -0.01 0.03 0.07 -0.22 -0.15 -0.08 -0.05 0.00
## NEGcount.m -0.10 -0.07 -0.04 -0.10 -0.06 -0.02 0.73 0.82 0.91 0.10 0.14
## NEGcount.v 0.13 0.16 0.20 0.03 0.06 0.10 0.77 0.86 0.94 -0.03 0.02
## mamr 0.64 0.69 0.74 -0.08 -0.03 0.03 -0.09 -0.02 0.06 -0.13 -0.05
## obj 0.00 0.03 0.08 -0.03 0.00 0.04 0.07 0.11 0.17 0.71 0.77
## upper low PA4 upper
## sentlen.m 0.50 -0.05 -0.02 0.02
## sentcount -0.12 -0.06 -0.03 0.01
## activity 0.24 -0.06 -0.02 0.02
## VERBfrac.m 0.09 -0.07 -0.03 0.01
## wordcount 0.06 -0.01 0.02 0.05
## entropy 0.03 0.33 0.38 0.46
## verbdist -0.09 -0.13 -0.08 -0.04
## maentropy 0.04 0.84 0.93 1.01
## predorder.m 0.34 -0.11 -0.05 0.00
## hapaxes -0.07 0.25 0.29 0.34
## VERBcomp 0.55 0.01 0.06 0.11
## NOUNcount.m 0.05 -0.02 0.02 0.06
## NEGcount.m 0.20 -0.03 0.01 0.05
## NEGcount.v 0.06 -0.06 -0.03 0.02
## mamr 0.02 -0.31 -0.24 -0.18
## obj 0.85 -0.05 0.00 0.04
##
## Interfactor correlations and bootstrapped confidence intervals
## lower estimate upper
## PA1-PA2 -0.124 0.111 0.26
## PA1-PA5 -0.533 -0.240 0.25
## PA1-PA3 -0.328 -0.079 0.15
## PA1-PA4 -0.496 -0.249 0.22
## PA2-PA5 0.234 0.306 0.37
## PA2-PA3 0.054 0.158 0.24
## PA2-PA4 0.060 0.146 0.24
## PA5-PA3 0.077 0.225 0.37
## PA5-PA4 0.152 0.275 0.39
## PA3-PA4 -0.111 0.015 0.14
# Largest absolute loading of each feature across all factors of fa_res,
# listed from the weakest to the strongest.
fa_res$loadings[] %>%
  abs() %>%
  as_tibble() %>%
  mutate(feat = colnames(data_engineered_5), .before = 1) %>%
  pivot_longer(cols = -feat) %>%
  group_by(feat) %>%
  summarize(maxload = max(value)) %>%
  arrange(maxload)
## # A tibble: 16 × 2
## feat maxload
## <chr> <dbl>
## 1 VERBcomp 0.636
## 2 mamr 0.688
## 3 predorder.m 0.724
## 4 entropy 0.747
## 5 obj 0.773
## 6 hapaxes 0.798
## 7 sentlen.m 0.817
## 8 NEGcount.m 0.819
## 9 NEGcount.v 0.860
## 10 activity 0.888
## 11 sentcount 0.897
## 12 NOUNcount.m 0.904
## 13 verbdist 0.911
## 14 VERBfrac.m 0.922
## 15 maentropy 0.927
## 16 wordcount 0.939
# Communality of each variable, in ascending order
sort(fa_res$communality)
## predorder.m VERBcomp mamr obj hapaxes NEGcount.v
## 0.5631195 0.6078405 0.6310865 0.6443121 0.6967551 0.7325530
## NEGcount.m NOUNcount.m verbdist activity entropy maentropy
## 0.7541247 0.7759391 0.7830506 0.8264822 0.8332093 0.8800070
## VERBfrac.m sentcount wordcount sentlen.m
## 0.8862010 0.8917030 0.9002307 0.9530608
# Names of the variables whose communality is below 0.5
names(fa_res$communality)[fa_res$communality < 0.5]
## character(0)
# Complexity of each variable, in ascending order
sort(fa_res$complexity)
## maentropy VERBfrac.m obj wordcount NOUNcount.m verbdist
## 1.020243 1.020505 1.044929 1.051276 1.057854 1.076180
## NEGcount.v NEGcount.m activity mamr predorder.m sentcount
## 1.085446 1.085961 1.119379 1.257157 1.266564 1.288863
## hapaxes entropy sentlen.m VERBcomp
## 1.383313 1.511799 1.582206 2.013233
# Names of the variables whose complexity exceeds 2
names(fa_res$complexity)[fa_res$complexity > 2]
## [1] "VERBcomp"
Comrey and Lee (1992): loadings excellent > .70 > very good > .63 > good > .55 > fair > .45 > poor > .32
# Draw the diagram of the factor solution, then print its loading matrix
psych::fa.diagram(fa_res)
stats::loadings(fa_res)
##
## Loadings:
## PA1 PA2 PA5 PA3 PA4
## sentlen.m -0.817 0.459
## sentcount 0.273 0.897 -0.118 -0.156
## activity 0.888 0.202
## VERBfrac.m 0.922
## wordcount -0.143 0.939
## entropy 0.747 0.384
## verbdist -0.911 -0.134
## maentropy 0.927
## predorder.m -0.724 -0.101 0.235
## hapaxes 0.144 -0.798 -0.108 0.290
## VERBcomp 0.636 -0.154 0.469
## NOUNcount.m -0.904 -0.149
## NEGcount.m 0.819 0.144
## NEGcount.v 0.163 0.860
## mamr 0.688 -0.240
## obj 0.111 0.773
##
## PA1 PA2 PA5 PA3 PA4
## SS loadings 5.508 2.901 1.515 1.205 1.165
## Proportion Var 0.344 0.181 0.095 0.075 0.073
## Cumulative Var 0.344 0.526 0.620 0.696 0.768
# Print, for every extracted factor, the variables loading on it (strongest
# first) together with a star rating of the loading strength based on the
# Comrey and Lee (1992) cut-offs (.70 / .63 / .55 / .45 / .32).
# seq_len() keeps the loop from running over c(1, 0) when there are no factors.
for (i in seq_len(fa_res$factors)) {
  cat("\n-----", colnames(fa_res$loadings)[i], "-----\n")
  loadings <- fa_res$loadings[, i]
  load_df <- data.frame(loading = loadings)
  load_df_filtered <- load_df %>%
    mutate(abs_l = abs(loading)) %>%
    # case_when() uses the first matching branch, so each band only needs its
    # lower bound; loadings of .32 or below get no stars.
    mutate(strng = case_when(
      abs_l > 0.70 ~ "*****",
      abs_l > 0.63 ~ "**** ",
      abs_l > 0.55 ~ "*** ",
      abs_l > 0.45 ~ "** ",
      abs_l > 0.32 ~ "* ",
      .default = ""
    )) %>%
    arrange(desc(abs_l)) %>%
    # negligible loadings (|loading| <= 0.1) are left out of the listing
    filter(abs_l > 0.1)
  load_df_filtered %>%
    mutate(across(c(loading, abs_l), ~ round(.x, 3))) %>%
    print()
  cat("\n")
}
##
## ----- PA1 -----
## loading abs_l strng
## VERBfrac.m 0.922 0.922 *****
## verbdist -0.911 0.911 *****
## NOUNcount.m -0.904 0.904 *****
## activity 0.888 0.888 *****
## sentlen.m -0.817 0.817 *****
## predorder.m -0.724 0.724 *****
## mamr 0.688 0.688 ****
## VERBcomp 0.636 0.636 ****
## sentcount 0.273 0.273
## NEGcount.v 0.163 0.163
## hapaxes 0.144 0.144
## wordcount -0.143 0.143
##
##
## ----- PA2 -----
## loading abs_l strng
## wordcount 0.939 0.939 *****
## sentcount 0.897 0.897 *****
## hapaxes -0.798 0.798 *****
## entropy 0.747 0.747 *****
##
##
## ----- PA5 -----
## loading abs_l strng
## NEGcount.v 0.860 0.860 *****
## NEGcount.m 0.819 0.819 *****
## VERBcomp -0.154 0.154
## NOUNcount.m -0.149 0.149
## sentcount -0.118 0.118
## obj 0.111 0.111
## predorder.m -0.101 0.101
##
##
## ----- PA3 -----
## loading abs_l strng
## obj 0.773 0.773 *****
## VERBcomp 0.469 0.469 **
## sentlen.m 0.459 0.459 **
## predorder.m 0.235 0.235
## activity 0.202 0.202
## sentcount -0.156 0.156
## NEGcount.m 0.144 0.144
## verbdist -0.134 0.134
## hapaxes -0.108 0.108
##
##
## ----- PA4 -----
## loading abs_l strng
## maentropy 0.927 0.927 *****
## entropy 0.384 0.384 *
## hapaxes 0.290 0.290
## mamr -0.240 0.240
hypotheses:
UPOS of passives annotated as ADJ in UD
strong correlations (but not necessarily significant):
significant correlations (CIs not spanning over 0):
NOTE: Variables with low communalities are excluded from the analysis, yet they still likely play a role in legal writing readability. This includes both those selected for the analysis and the excluded ones.
NOTE: some high-correlating variables were excluded from the FA.
# Uniqueness of each variable, rounded to three decimals
round(fa_res$uniquenesses, digits = 3)
## sentlen.m sentcount activity VERBfrac.m wordcount entropy
## 0.047 0.108 0.174 0.114 0.100 0.167
## verbdist maentropy predorder.m hapaxes VERBcomp NOUNcount.m
## 0.217 0.120 0.437 0.303 0.392 0.224
## NEGcount.m NEGcount.v mamr obj
## 0.246 0.267 0.369 0.356
# Combine the cleaned data with the factor solution. data_factor_bind() is
# defined elsewhere in this file; its result is used below through the
# `long` and `feat_long` elements.
res_data <- data_factor_bind(data_clean, fa_res)

# Shapiro-Wilk p-value of the factor scores, computed separately per factor
summarize(
  group_by(res_data$long, factor),
  shapiro = shapiro.test(factor_score)[["p.value"]]
)
## # A tibble: 5 × 2
## factor shapiro
## <fct> <dbl>
## 1 PA1 1.30e-11
## 2 PA2 1.66e-13
## 3 PA5 6.74e- 8
## 4 PA3 1.03e-14
## 5 PA4 1.70e- 8
# Factor scores per class, one facet row per factor. Points are jittered
# vertically only (width = 0), so the scores on the x axis stay exact.
ggplot(res_data$long, aes(x = factor_score, y = class)) +
  geom_jitter(width = 0, height = 0.1, alpha = 0.2) +
  facet_grid(rows = vars(factor)) +
  theme(legend.position = "bottom")
# analyze_distributions(res_data$long, "class")
# analyze_distributions(res_data$long, "subcorpus")
# analyze_distributions(
# res_data$long %>% filter(subcorpus != "LiFRLaw"), "subcorpus"
# )
# analyze_distributions(res_data$long, "RecipientType")
Court decisions often have RecipientType = combined.
# analyze_distributions(res_data$long, "RecipientIndividuation")
# analyze_distributions(res_data$long, "Objectivity")
# analyze_distributions(res_data$long, "Bindingness")
# Correlation between every feature's values and every factor's scores,
# one row per (feat, factor) pair.
# `.groups = "drop_last"` spells out what summarize() does by default here
# (the result stays grouped by `feat`), which avoids the grouping message
# without changing the returned tibble.
data_factors_correlations <- res_data$feat_long %>%
  group_by(feat, factor) %>%
  summarize(
    correlation = cor(feat_value, factor_score),
    .groups = "drop_last"
  )
## `summarise()` has grouped output by 'feat'. You can override using the
## `.groups` argument.
# Heatmap of feature-factor correlations: one tile per (factor, feat) pair,
# filled by the correlation and labelled with its value rounded to 2 decimals.
#
# correlations: data frame with the columns `factor`, `feat` and `correlation`.
# Returns the ggplot object.
plot_factor_correlations <- function(correlations) {
  ggplot(
    correlations,
    aes(
      x = factor,
      y = feat,
      fill = correlation,
      label = round(correlation, 2)
    )
  ) +
    geom_tile() +
    geom_text() +
    scale_fill_gradient2()
}

# Features listed in final_collist
data_factors_correlations %>%
  filter(feat %in% final_collist) %>%
  plot_factor_correlations()

# All remaining features (those not listed in final_collist)
data_factors_correlations %>%
  filter(!(feat %in% final_collist)) %>%
  plot_factor_correlations()